diff --git a/.gitignore b/.gitignore index ac56a3320ec85769d2c87c072512f5217eca0c24..fe0d13f4d9eab2c2a8e7001c9ecb69cce1333af1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,9 @@ +paddle/operators/check_t.save +paddle/operators/check_tensor.ls +paddle/operators/tensor.save +python/paddle/v2/fluid/tests/book/image_classification_resnet.inference.model/ +python/paddle/v2/fluid/tests/book/image_classification_vgg.inference.model/ +python/paddle/v2/fluid/tests/book/label_semantic_roles.inference.model/ *.DS_Store build/ build_doc/ @@ -27,5 +33,5 @@ CMakeFiles cmake_install.cmake paddle/.timestamp python/paddlepaddle.egg-info/ -paddle/pybind/pybind.h +paddle/fluid/pybind/pybind.h python/paddle/version.py diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a60453ff4e3bba6e6cb3b3de915dd69afd3a1ec3..3c36cffcb4eeaaf7f8cff5167777628dd2697e7d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,5 +1,8 @@ # Contribute Code +You are welcome to contribute to project PaddlePaddle. To contribute to PaddlePaddle, you have to agree with the +[PaddlePaddle Contributor License Agreement](https://gist.github.com/wangkuiyi/0c22c7b1bd3bb7eb27d76f85c3a3e329). + We sincerely appreciate your contribution. This document explains our workflow and work style. ## Workflow diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 6bea7cf3022242ce48cc882915f7e71810937283..de94bd5008effef1bf0fd3a125d4aed56e1b7f81 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -181,7 +181,8 @@ elseif(CMAKE_BUILD_TYPE STREQUAL "Release") elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}) elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") - list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_MINSIZEREL}) + # nvcc 9 does not support -Os. 
Use Release flags instead + list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) endif() mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 33ef6860e1d38f4e87c4431addf43f9f8a655fc2..1cb54ba2164fafbfce9f28a3e894ae5e78a9cd68 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -179,20 +179,24 @@ function(cc_library TARGET_NAME) set(oneValueArgs "") set(multiValueArgs SRCS DEPS) cmake_parse_arguments(cc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - if (cc_library_SRCS) - if (cc_library_SHARED OR cc_library_shared) # build *.so + if(cc_library_SRCS) + if(cc_library_SHARED OR cc_library_shared) # build *.so add_library(${TARGET_NAME} SHARED ${cc_library_SRCS}) else() add_library(${TARGET_NAME} STATIC ${cc_library_SRCS}) endif() - if (cc_library_DEPS) + if(cc_library_DEPS) # Don't need link libwarpctc.so - if ("${cc_library_DEPS};" MATCHES "warpctc;") + if("${cc_library_DEPS};" MATCHES "warpctc;") list(REMOVE_ITEM cc_library_DEPS warpctc) add_dependencies(${TARGET_NAME} warpctc) endif() + # Support linking flags: --whole-archive (Linux) / -force_load (MacOS) + target_circle_link_libraries(${TARGET_NAME} ${cc_library_DEPS}) + if("${cc_library_DEPS}" MATCHES "ARCHIVE_START") + list(REMOVE_ITEM cc_library_DEPS ARCHIVE_START ARCHIVE_END) + endif() add_dependencies(${TARGET_NAME} ${cc_library_DEPS}) - target_link_libraries(${TARGET_NAME} ${cc_library_DEPS}) endif() # cpplint code style diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst index e24613b94b422b7cdf9c6383c359fa92a4faf6ff..58c493fd7412cf9dbe507c9622d67dae33a5fb25 100644 --- a/doc/api/v2/fluid/layers.rst +++ b/doc/api/v2/fluid/layers.rst @@ -323,6 +323,12 @@ batch_norm .. autofunction:: paddle.v2.fluid.layers.batch_norm :noindex: +layer_norm +---------- + +.. 
autofunction:: paddle.v2.fluid.layers.layer_norm + :noindex: + beam_search_decode ------------------ diff --git a/doc/index_cn.rst b/doc/index_cn.rst index 63a78428583477792e309a3b3d26af340caccfca..0f645db6fc5d0f84bbe0cbb335677752e3a355ea 100644 --- a/doc/index_cn.rst +++ b/doc/index_cn.rst @@ -8,5 +8,4 @@ PaddlePaddle 文档 build_and_install/index_cn.rst howto/index_cn.rst dev/index_cn.rst - api/index_cn.rst faq/index_cn.rst diff --git a/doc/index_en.rst b/doc/index_en.rst index 5631381be087017c26b2a6a3984b3c5bdb49f12d..166f56c28f464563a0b36007f58cebb58c286916 100644 --- a/doc/index_en.rst +++ b/doc/index_en.rst @@ -8,4 +8,3 @@ PaddlePaddle Documentation build_and_install/index_en.rst howto/index_en.rst dev/index_en.rst - api/index_en.rst diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt index 3f9c132ef6ae03c7614e10484715676c8019821e..c7deba2ab475d3c4f2c95327af77af7031b591fd 100644 --- a/paddle/CMakeLists.txt +++ b/paddle/CMakeLists.txt @@ -19,12 +19,7 @@ else() endif() if(NOT ANDROID AND NOT IOS) - add_subdirectory(memory) - add_subdirectory(platform) - add_subdirectory(framework) - add_subdirectory(operators) - add_subdirectory(pybind) - add_subdirectory(inference) + add_subdirectory(fluid) endif() if(WITH_SWIG_PY) diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6b4191518c45d0579f800ecb901dcd9667e17d5 --- /dev/null +++ b/paddle/fluid/CMakeLists.txt @@ -0,0 +1,6 @@ +add_subdirectory(memory) +add_subdirectory(platform) +add_subdirectory(framework) +add_subdirectory(operators) +add_subdirectory(pybind) +add_subdirectory(inference) diff --git a/paddle/framework/.clang-format b/paddle/fluid/framework/.clang-format similarity index 100% rename from paddle/framework/.clang-format rename to paddle/fluid/framework/.clang-format diff --git a/paddle/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt similarity index 100% rename from paddle/framework/CMakeLists.txt rename to paddle/fluid/framework/CMakeLists.txt diff --git a/paddle/fluid/framework/attribute.cc b/paddle/fluid/framework/attribute.cc new file mode 100644 index 0000000000000000000000000000000000000000..1d7e7366b0723c630b24d62c1f5d0a72cf42d770 --- /dev/null +++ b/paddle/fluid/framework/attribute.cc @@ -0,0 +1,74 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/attribute.h" + +#include + +namespace paddle { +namespace framework { + +Attribute GetAttrValue(const proto::OpDesc::Attr& attr_desc) { + switch (attr_desc.type()) { + case proto::AttrType::BOOLEAN: { + return attr_desc.b(); + } + case proto::AttrType::INT: { + return attr_desc.i(); + } + case proto::AttrType::FLOAT: { + return attr_desc.f(); + } + case proto::AttrType::STRING: { + return attr_desc.s(); + } + case proto::AttrType::BOOLEANS: { + std::vector val(attr_desc.bools_size()); + for (int i = 0; i < attr_desc.bools_size(); ++i) { + val[i] = attr_desc.bools(i); + } + return val; + } + case proto::AttrType::INTS: { + std::vector val(attr_desc.ints_size()); + for (int i = 0; i < attr_desc.ints_size(); ++i) { + val[i] = attr_desc.ints(i); + } + return val; + } + case proto::AttrType::FLOATS: { + std::vector val(attr_desc.floats_size()); + for (int i = 0; i < attr_desc.floats_size(); ++i) { + val[i] = attr_desc.floats(i); + } + return val; + } + case proto::AttrType::STRINGS: { + std::vector val(attr_desc.strings_size()); + for (int i = 0; i < attr_desc.strings_size(); ++i) { + val[i] = attr_desc.strings(i); + } + return val; + } + case proto::AttrType::LONG: { + return attr_desc.l(); + } + default: + PADDLE_THROW("Unsupport attr type %d", attr_desc.type()); + } + return boost::blank(); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/attribute.h b/paddle/fluid/framework/attribute.h new file mode 100644 index 0000000000000000000000000000000000000000..16be42ae71497bcc755d10eee2d73d331ede7da6 --- /dev/null +++ b/paddle/fluid/framework/attribute.h @@ -0,0 +1,284 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/type_defs.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +template +inline proto::AttrType AttrTypeID() { + Attribute tmp = T(); + return static_cast(tmp.which() - 1); +} + +Attribute GetAttrValue(const proto::OpDesc::Attr& attr_desc); + +class AttrReader { + public: + explicit AttrReader(const AttributeMap& attrs) : attrs_(attrs) {} + + template + inline const T& Get(const std::string& name) const { + PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap", + name); + return boost::get(attrs_.at(name)); + } + + private: + const AttributeMap& attrs_; +}; + +// check whether a value(attribute) fit a certain limit +template +class GreaterThanChecker { + public: + explicit GreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {} + void operator()(T& value) const { + PADDLE_ENFORCE(value > lower_bound_, "larger_than check fails."); + } + + private: + T lower_bound_; +}; + +template +class EqualGreaterThanChecker { + public: + explicit EqualGreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {} + void operator()(T& value) const { + PADDLE_ENFORCE_GE(value, lower_bound_, "equal_larger_than check fails."); + } + + private: + T lower_bound_; +}; + +// we can provide users more common Checker, like 'LessThanChecker', +// 'BetweenChecker'... + +template +class DefaultValueSetter { + public: + explicit DefaultValueSetter(T default_value) + : default_value_(default_value) {} + void operator()(T& value) const { value = default_value_; } + + private: + T default_value_; +}; + +template +class EnumInContainer { + public: + explicit EnumInContainer(const std::unordered_set& c) : container_(c) {} + void operator()(T& val) const { + PADDLE_ENFORCE(container_.find(val) != container_.end(), + "Value %s is not in enum container %s", val, + ContainerDebugString()); + } + + private: + std::string ContainerDebugString() const { + std::ostringstream sout; + sout << "["; + size_t cnt = 0; + for (auto& v : container_) { + sout << v; + ++cnt; + if (cnt != container_.size()) { + sout << " ,"; + } + } + sout << "]"; + return sout.str(); + } + + std::unordered_set container_; +}; + +template +struct ExtractAttribute { + explicit ExtractAttribute(const std::string& attr_name) + : attr_name_(attr_name) {} + + T* operator()(Attribute& attr) const { + T* attr_value = nullptr; + try { + attr_value = &boost::get(attr); + } catch (boost::bad_get& bad_get) { + PADDLE_THROW("Cannot get attribute %s by type %s, its type is %s", + attr_name_, typeid(T).name(), attr.type().name()); + } + return attr_value; + } + + const std::string& attr_name_; +}; + +// special handle bool +// FIXME(yuyang18): Currently we cast bool into int in python binding. It is +// hard to change the logic there. In another way, we should correct handle +// if the user set `some_flag=1`. +// +// FIX ME anytime if there is a better solution. 
+template <> +struct ExtractAttribute { + explicit ExtractAttribute(const std::string& attr_name) + : attr_name_(attr_name) {} + + bool* operator()(Attribute& attr) const { + if (attr.type() == typeid(int)) { // NOLINT + int val = boost::get(attr); + attr = static_cast(val); + } else if (attr.type() == typeid(float)) { // NOLINT + float val = boost::get(attr); + attr = static_cast(val); + } + bool* attr_value = nullptr; + try { + attr_value = &boost::get(attr); + } catch (boost::bad_get& bad_get) { + PADDLE_THROW("Cannot get attribute %s by type bool, its type is %s", + attr_name_, attr.type().name()); + } + return attr_value; + } + + const std::string& attr_name_; +}; + +template <> +struct ExtractAttribute { + explicit ExtractAttribute(const std::string& attr_name) + : attr_name_(attr_name) {} + + int64_t* operator()(Attribute& attr) const { + if (attr.type() == typeid(int)) { // NOLINT + int val = boost::get(attr); + attr = static_cast(val); + } else if (attr.type() == typeid(float)) { // NOLINT + int val = boost::get(attr); + attr = static_cast(val); + } + int64_t* attr_value = nullptr; + try { + attr_value = &boost::get(attr); + } catch (boost::bad_get& bad_get) { + PADDLE_THROW("Cannot get attribute %s by type int64_t, its type is %s", + attr_name_, attr.type().name()); + } + return attr_value; + } + + const std::string& attr_name_; +}; + +// check whether a certain attribute fit its limits +// an attribute can have more than one limits +template +class TypedAttrChecker { + typedef std::function ValueChecker; + + public: + explicit TypedAttrChecker(const std::string& attr_name) + : attr_name_(attr_name) {} + + TypedAttrChecker& InEnum(const std::unordered_set& range) { + value_checkers_.push_back(EnumInContainer(range)); + return *this; + } + + TypedAttrChecker& GreaterThan(const T& lower_bound) { + value_checkers_.push_back(GreaterThanChecker(lower_bound)); + return *this; + } + + TypedAttrChecker& EqualGreaterThan(const T& lower_bound) { + value_checkers_.push_back(EqualGreaterThanChecker(lower_bound)); + return *this; + } + + // we can add more common limits, like LessThan(), Between()... 
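  // Illustrative sketch (added note, not part of the original patch): a
  // LessThan() limit could mirror GreaterThanChecker above. The names
  // LessThanChecker / LessThan below are hypothetical and only show how a
  // new limit would plug into value_checkers_:
  //
  //   template <typename T>
  //   class LessThanChecker {
  //    public:
  //     explicit LessThanChecker(T upper_bound) : upper_bound_(upper_bound) {}
  //     void operator()(T& value) const {
  //       PADDLE_ENFORCE(value < upper_bound_, "less_than check fails.");
  //     }
  //
  //    private:
  //     T upper_bound_;
  //   };
  //
  //   // member of TypedAttrChecker, next to GreaterThan()/EqualGreaterThan():
  //   TypedAttrChecker& LessThan(const T& upper_bound) {
  //     value_checkers_.push_back(LessThanChecker<T>(upper_bound));
  //     return *this;
  //   }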
+ + TypedAttrChecker& SetDefault(const T& default_value) { + PADDLE_ENFORCE(default_value_setter_.empty(), + "%s can't have more than one default value!", attr_name_); + default_value_setter_.push_back(DefaultValueSetter(default_value)); + return *this; + } + + // allow users provide their own checker + TypedAttrChecker& AddCustomChecker(const ValueChecker& checker) { + value_checkers_.push_back(checker); + return *this; + } + + void operator()(AttributeMap& attr_map) const { + if (!attr_map.count(attr_name_)) { + // user do not set this attr + PADDLE_ENFORCE(!default_value_setter_.empty(), + "Attribute '%s' is required!", attr_name_); + // default_value_setter_ has no more than one element + T val; + (default_value_setter_[0])(val); + attr_map[attr_name_] = val; + } + Attribute& attr = attr_map.at(attr_name_); + ExtractAttribute extract_attr(attr_name_); + T* attr_value = extract_attr(attr); + for (const auto& checker : value_checkers_) { + checker(*attr_value); + } + } + + private: + std::string attr_name_; + std::vector value_checkers_; + std::vector default_value_setter_; +}; + +// check whether op's all attributes fit their own limits +class OpAttrChecker { + typedef std::function AttrChecker; + + public: + template + TypedAttrChecker& AddAttrChecker(const std::string& attr_name) { + attr_checkers_.push_back(TypedAttrChecker(attr_name)); + AttrChecker& checker = attr_checkers_.back(); + return *(checker.target>()); + } + + void Check(AttributeMap& attr_map) const { + for (const auto& checker : attr_checkers_) { + checker(attr_map); + } + } + + private: + std::vector attr_checkers_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/backward.cc b/paddle/fluid/framework/backward.cc new file mode 100644 index 0000000000000000000000000000000000000000..c4795f4fc5c73034b23305162ea3b710480d8ebc --- /dev/null +++ b/paddle/fluid/framework/backward.cc @@ -0,0 +1,585 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/backward.h" +#include "paddle/fluid/operators/net_op.h" + +#include +#include +#include +#include + +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/net_op.h" + +namespace paddle { +namespace framework { + +static std::unordered_set* g_ctrl_flow_ops_ = nullptr; +// Control Flow operators's backward is significantly different from +// computational operators. Hack Code here. +// We should design a better way to backward CtrlFlowOps. 
+static std::unordered_set& CtrlFlowOps() { + if (g_ctrl_flow_ops_ == nullptr) { + g_ctrl_flow_ops_ = new std::unordered_set{ + "increment", "lod_rank_table", "less_than"}; + } + return *g_ctrl_flow_ops_; +} + +static inline std::unique_ptr CreateGradOp( + const OperatorBase& op, const std::unordered_set& no_grad_set, + std::unordered_map* grad_to_var) { + OpDesc op_desc; + op_desc.SetInputMap(op.Inputs()); + op_desc.SetOutputMap(op.Outputs()); + op_desc.SetType(op.Type()); + op_desc.SetAttrMap(op.Attrs()); + auto& info = OpInfoMap::Instance().Get(op.Type()); + auto grad_descs = info.GradOpMaker()(op_desc, no_grad_set, grad_to_var, {}); + std::vector> grad_ops; + grad_ops.reserve(grad_descs.size()); + std::transform(grad_descs.begin(), grad_descs.end(), + std::back_inserter(grad_ops), + [](const std::unique_ptr& grad_desc) { + return OpRegistry::CreateOp(*grad_desc); + }); + PADDLE_ENFORCE(!grad_ops.empty()); + if (grad_ops.size() == 1) { + return std::move(grad_ops[0]); + } else { + auto net_op = new operators::NetOp(); + for (auto& grad_op : grad_ops) { + net_op->AppendOp(std::move(grad_op)); + } + net_op->CompleteAddOp(); + return std::unique_ptr(net_op); + } +} + +template +static void ForEachVarName(const Map& names, T callback) { + for (auto& name : names) { + for (auto& n : name.second) { + if (callback(n)) return; + } + } +} + +// return whether all the names + suffixes in the set +static bool AllInSet( + const std::map>& names, + const std::string& suffix, const std::unordered_set& set) { + bool all_in_set = true; + ForEachVarName(names, [&all_in_set, &set, &suffix](const std::string& n) { + all_in_set = set.find(n + suffix) != set.end(); + return !all_in_set; + }); + return all_in_set; +} + +static std::unique_ptr NOP() { + auto net_op = new operators::NetOp(); + net_op->SetType("@NOP@"); + net_op->CompleteAddOp(); + return std::unique_ptr(net_op); +} + +// Get backward operator from a forward operator, a recursive implementation. +// +// no_grad_names the gradient variable names without gradient calculating. +// +// uniq_id is a unique index used inside recursively calling +// BackwardRecursive. use `uid = uniq_id++;` to get the unique index, and +// pass `uniq_id` through recursive calling. +// +// returns The backward operator. In a simple situation, it may be a simple +// operator, in a complex situation, it maybe a NetOp. +// +// See Backward.h for details +static std::unique_ptr BackwardRecursive( + const OperatorBase& forwardOp, + std::unordered_set& no_grad_names, + std::unordered_map* grad_to_var, + size_t& uniq_id) { + // If all input gradients of forwarding operator do not need to calculate, + // just return an NOP. Not return null ptr because NOP does not take + // too much time for calculation, but it is useful for simplifying logic. + if (AllInSet(forwardOp.Inputs() /*names*/, kGradVarSuffix /*suffix*/, + no_grad_names /*set*/)) { + return NOP(); + } + + // All output gradients of forwarding operator do not need to calculate. + // Then all input gradients cannot be computed at all, and we put them into + // `no_grad_names` set. Return an NOP. 
+ if (AllInSet(forwardOp.Outputs() /*names*/, kGradVarSuffix /*suffix*/, + no_grad_names /*set*/)) { + ForEachVarName(forwardOp.Inputs(), + [&no_grad_names](const std::string& name) -> bool { + no_grad_names.insert(GradVarName(name)); + return false; + }); + return NOP(); + } + + // Returned gradient network + auto net = std::unique_ptr(new operators::NetOp()); + + if (forwardOp.IsNetOp()) { + // Because forwardOp is a net op, it can static_cast. + auto& forwardNet = static_cast(forwardOp); + + // Map from output gradient variable name to operator's indices in + // backward net's ops_. That operator generates that variable. + std::unordered_map> dup_output_ops; + + size_t local_op_id = 0; + // reversely travel forwardNet and collect all duplicate outputs. + for (auto it = forwardNet.ops_.rbegin(); it != forwardNet.ops_.rend(); + ++it, ++local_op_id) { + auto& fwd = *it; + auto bwd = BackwardRecursive(*fwd, no_grad_names, grad_to_var, uniq_id); + ForEachVarName(bwd->Outputs(), + [&dup_output_ops, local_op_id](const std::string& out) { + dup_output_ops[out].emplace_back(local_op_id); + return false; + }); + net->AppendOp(std::move(bwd)); + } + // Get unique ID for this method. + auto uid = uniq_id++; + // TODO(dzh): more comment + // multiple operators which have the same output (y for example) may + // overwrite the same y variable when backward, special operations are token + // to handle this case. For each duplicate output, rename it to an alias + // (original name with a offset), append an `add` op for its operator, + // and finally sum all the alias variable to the final output variable y. + using Pos = std::pair>; + std::list insert_position; + for (auto& dup_output_op : dup_output_ops) { + const std::string& name = dup_output_op.first; + // duplicate @Empty@ don't need to be added + if (name == kEmptyVarName) continue; + + auto& dup_op = dup_output_op.second; + // no duplicate output + if (dup_op.size() == 1) continue; + + // process the duplicate outputs + std::vector dup_outputs; + for (size_t i = 0; i < dup_op.size(); ++i) { + // rename each duplicate output to an alias + auto op_offset = dup_op[i]; + dup_outputs.push_back(name + "@RENAME@" + std::to_string(uid) + "@" + + std::to_string(i)); + net->ops_[op_offset]->Rename(name, dup_outputs.back()); + } + // collect all the offset for each alias, + // insert a sum operator to add all aliases to output + insert_position.push_back( + {dup_op.back(), + OpRegistry::CreateOp("sum", {{"X", dup_outputs}}, {{"Out", {name}}}, + AttributeMap{})}); + } + + // make sure the inserted `sum` ops follow the BFS order. + insert_position.sort( + [](const Pos& l, const Pos& r) { return l.first > r.first; }); + + for (auto& pos : insert_position) { + net->InsertOp(pos.first + 1, std::move(pos.second)); + } + } else { + std::unique_ptr grad_op( + CreateGradOp(forwardOp, no_grad_names, grad_to_var)); + + ForEachVarName(grad_op->Inputs(), [&no_grad_names, &net, &grad_op]( + const std::string& grad_input) { + if (no_grad_names.count(grad_input)) { + // +1 for \0 + std::string prefix = grad_input.substr( + 0, grad_input.size() - sizeof(kGradVarSuffix) / sizeof(char) + 1); + grad_op->Rename(grad_input, prefix + kZeroVarSuffix); + + // If part of input gradient of that operator is not calculated, fill + // zero variables to that input gradient. 
+ net->AppendOp(OpRegistry::CreateOp("fill_zeros_like", {{"X", {prefix}}}, + {{"Out", {grad_input}}}, + AttributeMap{})); + } + return false; + }); + + ForEachVarName(grad_op->Outputs(), + [&no_grad_names, &grad_op](const std::string& grad_output) { + if (no_grad_names.count(grad_output)) { + grad_op->Rename(grad_output, kEmptyVarName); + } + return false; + }); + + if (net->ops_.empty()) { // Current no aux op is added to network + return grad_op; + } + net->AppendOp(std::move(grad_op)); + } + net->SetType("@GENERATED_BACKWARD@"); + net->CompleteAddOp(); + return std::unique_ptr( + static_cast(net.release())); +} + +// See header for comments +std::unique_ptr Backward( + const OperatorBase& forwardOp, + const std::unordered_set& no_grad_vars) { + std::unordered_set no_grad_names; + no_grad_names.reserve(no_grad_vars.size() + 1); + + no_grad_names.insert(std::string(kEmptyVarName) + kGradVarSuffix); + + for (auto& name : no_grad_vars) { + no_grad_names.insert(name + kGradVarSuffix); + } + size_t uid = 0; + std::unordered_map grad_to_var; + return BackwardRecursive(forwardOp, no_grad_names, &grad_to_var, uid); +} + +// ==================================== // + +static bool AllGradInSet(const std::vector& names, + const std::unordered_set& set) { + for (const std::string& name : names) { + if (!set.count(GradVarName(name))) { + return false; + } + } + if (VLOG_IS_ON(10)) { + std::ostringstream sout; + sout << "All input {"; + for (auto& name : names) { + sout << name << ","; + } + sout << "} is in {"; + for (auto& name : set) { + sout << name << ","; + } + sout << "}"; + VLOG(10) << sout.str(); + } + return true; +} + +static std::string FwdName(const std::string& grad_name) { + auto pos = grad_name.find("@GRAD"); + if (pos == std::string::npos) { + return ""; + } else { + return grad_name.substr(0, pos); + } +} + +static void CreateGradVarInBlock( + size_t grad_op_start_index, + const std::unordered_map& param_name_map, + BlockDesc* block_desc, + std::unordered_map* grad_var_record) { + auto ops = block_desc->AllOps(); + for (size_t op_index = grad_op_start_index; op_index < ops.size(); + ++op_index) { + std::unordered_set new_vars; + auto& ctrl_flow_ops = CtrlFlowOps(); + ForEachVarName(ops[op_index]->Outputs(), + [&](const std::string& grad_var_name) { + if (ctrl_flow_ops.find(ops[op_index]->Type()) != + ctrl_flow_ops.end()) { + if (block_desc->HasVarRecursive(grad_var_name)) { + return false; + } + } else { + if (block_desc->HasVar(grad_var_name)) { + return false; + } + } + if (grad_var_name == framework::kEmptyVarName) { + return false; + } + auto var = block_desc->Var(grad_var_name); + VLOG(10) << "Creating Variable " << grad_var_name; + new_vars.insert(var->Name()); + auto it = param_name_map.find(grad_var_name); + if (it == param_name_map.end()) { + return false; + } + auto param_var_name = it->second; + auto& grad_record = (*grad_var_record)[param_var_name]; + grad_record.name_ = grad_var_name; + grad_record.block_idx_ = block_desc->ID(); + grad_record.op_idx_ = static_cast(op_index); + return false; /* not break */ + }); + ops[op_index]->InferVarType(block_desc); + for (auto& arg : ops[op_index]->OutputArgumentNames()) { + if (new_vars.find(arg) == new_vars.end()) { + continue; + } + auto pname = FwdName(arg); + auto* param = block_desc->FindVarRecursive(pname); + auto* grad = block_desc->FindVar(arg); + if (param == nullptr) { + grad->SetDataType(proto::DataType::FP32); + } else { + grad->SetDataType(param->GetDataType()); + } + } + ops[op_index]->InferShape(*block_desc); + } +} 
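// Added note (not part of the original patch): a summary of the gradient
// naming conventions used throughout this file. For a forward variable "x",
// GradVarName("x") appends kGradVarSuffix and yields "x@GRAD". If a gradient
// input appears in no_grad_vars, the op's input is renamed with kZeroVarSuffix
// and a "fill_zeros_like" op is inserted to feed it zeros; gradient outputs
// that are not needed are renamed to kEmptyVarName. When several ops write the
// same gradient variable, each write is renamed to an "@RENAME@" alias and a
// "sum" op is inserted to add the aliases back into the original variable.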
+ +std::vector> MakeOpGrad( + const OpDesc* op_desc, std::unordered_set* no_grad_vars, + std::unordered_map* grad_to_var, + const std::vector& grad_block = std::vector()) { + std::vector> grad_op_descs; + // All input gradients of forwarding operator do not need to calculate. + const std::vector& inputs = op_desc->InputArgumentNames(); + if (AllGradInSet(inputs, *no_grad_vars)) { + VLOG(10) << "Drop operator " << op_desc->Type(); + return grad_op_descs; // empty vector + } + + // All output gradients of forwarding operator do not need to calculate. + const std::vector& outputs = op_desc->OutputArgumentNames(); + + if (AllGradInSet(outputs, *no_grad_vars)) { + VLOG(10) << "Drop operator " << op_desc->Type(); + // FIXME: Hack code here + auto& ctrl_flow_ops = CtrlFlowOps(); + if (ctrl_flow_ops.find(op_desc->Type()) == ctrl_flow_ops.end()) { + // Only computational op need drop input's gradient. + for (const std::string& name : inputs) { + no_grad_vars->insert(GradVarName(name)); + VLOG(10) << " Also drop " << GradVarName(name); + } + } + + return grad_op_descs; // empty vector + } + + grad_op_descs = + OpInfoMap::Instance() + .Get(op_desc->Type()) + .GradOpMaker()(*op_desc, *no_grad_vars, grad_to_var, grad_block); + + std::list> pending_fill_zeros_ops; + for (auto& desc : grad_op_descs) { + for (const std::string& in_name : desc->InputArgumentNames()) { + if (no_grad_vars->count(in_name)) { + std::string prefix = in_name.substr( + 0, in_name.size() - sizeof(kGradVarSuffix) / sizeof(char) + 1); + std::string new_name = prefix + kZeroVarSuffix; + desc->Rename(in_name, new_name); + std::unique_ptr fill_zeros_op( + new OpDesc("fill_zeros_like", {{"X", {prefix}}}, + {{"Out", {new_name}}}, AttributeMap{})); + pending_fill_zeros_ops.push_back(std::move(fill_zeros_op)); + } + } + } + + for (auto& p : pending_fill_zeros_ops) { + grad_op_descs.insert(grad_op_descs.begin(), std::move(p)); + } + return grad_op_descs; +} + +static BlockDesc* CreateStepBlock( + ProgramDesc& program_desc, std::unordered_set* no_grad_vars, + std::unordered_map* grad_to_var, + int step_block_idx); + +std::vector> MakeBlockBackward( + ProgramDesc& program_desc, int block_idx, + std::unordered_set* no_grad_vars, + std::unordered_map* grad_to_var) { + VLOG(5) << "MakeBlockBackward"; + BlockDesc* cur_block = program_desc.MutableBlock(block_idx); + std::vector op_descs = cur_block->AllOps(); + std::unordered_map> dup_out_ops; + size_t grad_desc_idx = 0; + std::vector> backward_descs; + + for (auto it = op_descs.rbegin(); it != op_descs.rend(); ++it) { + VLOG(5) << "Making backward " << (*it)->Type() << " op"; + std::vector> op_grads; + + if ((*it)->Type() == "recurrent" || (*it)->Type() == "while" || + (*it)->Type() == "parallel_do") { + int step_block_idx = (*it)->GetBlockAttr("sub_block"); + BlockDesc* backward_block = CreateStepBlock(program_desc, no_grad_vars, + grad_to_var, step_block_idx); + op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block}); + } else if ((*it)->Type() == "conditional_block") { + BlockDesc* backward_block = + CreateStepBlock(program_desc, no_grad_vars, grad_to_var, + (*it)->GetBlockAttr("sub_block")); + op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block}); + } else { + op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var); + } + + if (VLOG_IS_ON(10)) { + std::ostringstream sout; + sout << "Made "; + for (auto& op_grad : op_grads) { + sout << op_grad->Type() << " "; + } + VLOG(10) << sout.str(); + } + + for (const auto& desc : op_grads) { + for (const std::string& 
out_name : desc->OutputArgumentNames()) { + if (out_name.find("@GRAD") == std::string::npos) { + // Not all outputs of a backward operator is a gradient. Only gradient + // need to be sum. Skip variables are not gradient. + continue; + } + dup_out_ops[out_name].emplace_back(grad_desc_idx); + } + ++grad_desc_idx; + } + std::transform(op_grads.begin(), op_grads.end(), + std::back_inserter(backward_descs), + [](std::unique_ptr& ptr) { return std::move(ptr); }); + } + + VLOG(5) << "Appending Sums"; + // Check whether some variables are written more than once + std::list>> pending_sum_ops; + for (const auto& dup : dup_out_ops) { + const std::string& out_name = dup.first; + const std::vector dup_op = dup.second; + if (out_name != kEmptyVarName && dup_op.size() > 1) { + std::vector sum_op_inputs; + std::string next_g_name = out_name; + for (size_t i = 0; i < dup_op.size(); ++i) { + VLOG(10) << backward_descs[dup_op[i]]->Type() << " has " << out_name + << " duplicated"; + std::string new_name = out_name + "@RENAME@" + std::to_string(i); + backward_descs[dup_op[i]]->RenameOutput(out_name, new_name); + backward_descs[dup_op[i]]->RenameInput(out_name, next_g_name); + sum_op_inputs.emplace_back(new_name); + next_g_name = sum_op_inputs.back(); + } + std::unique_ptr sum_op(new OpDesc("sum", {{"X", sum_op_inputs}}, + {{"Out", {out_name}}}, + AttributeMap{})); + pending_sum_ops.push_back({dup_op.back(), std::move(sum_op)}); + } + } + + pending_sum_ops.sort([](const std::pair>& a, + const std::pair>& b) { + return a.first > b.first; + }); + for (auto& p : pending_sum_ops) { + backward_descs.insert(backward_descs.begin() + p.first + 1, + std::move(p.second)); + } + + VLOG(5) << "MakeBlockBackward Finished"; + + return backward_descs; +} + +static BlockDesc* CreateStepBlock( + ProgramDesc& program_desc, std::unordered_set* no_grad_vars, + std::unordered_map* grad_to_var, + int step_block_idx) { + auto backward_block_op_descs = MakeBlockBackward(program_desc, step_block_idx, + no_grad_vars, grad_to_var); + BlockDesc* backward_block = + program_desc.AppendBlock(*program_desc.MutableBlock(step_block_idx)); + for (auto& ptr : backward_block_op_descs) { + backward_block->AppendAllocatedOp(move(ptr)); + } + return backward_block; +} + +ParamGradInfoMap AppendBackward( + ProgramDesc& program_desc, const VarDesc& target, + const std::unordered_set& no_grad_vars) { + std::unordered_set no_grad_var_names; + no_grad_var_names.reserve(no_grad_vars.size() + 1); + no_grad_var_names.insert(std::string(kEmptyVarName) + kGradVarSuffix); + for (auto& name : no_grad_vars) { + no_grad_var_names.insert(GradVarName(name)); + } + + const int root_block_idx = 0; + auto root_block = program_desc.MutableBlock(root_block_idx); + + std::string fill_one_op_out = GradVarName(target.Name()); + bool is_scalar = target.GetShape() == std::vector{1}; + PADDLE_ENFORCE(is_scalar, "target should be scalar"); + VLOG(3) << "backward from loss=" << target.Name() + << " data_type=" << target.GetDataType(); + std::unique_ptr fill_one_op( + new OpDesc("fill_constant", {}, {{"Out", {fill_one_op_out}}}, + {{"shape", std::vector{1}}, + {"value", static_cast(1.0)}, + {"dtype", target.GetDataType()}})); + // infer var type of fill_one_op + fill_one_op->InferVarType(root_block); + + root_block->AppendAllocatedOp(std::move(fill_one_op)); + size_t forward_op_num = root_block->OpSize(); + size_t forward_block_num = program_desc.Size(); + + // Insert backward operators + std::unordered_map grad_to_var; + auto backward_op_descs = MakeBlockBackward(program_desc, 
root_block_idx, + &no_grad_var_names, &grad_to_var); + + for (auto& ptr : backward_op_descs) { + root_block->AppendAllocatedOp(std::move(ptr)); + } + // Create Variable + + // Create target gradient variable + std::unordered_map retv; + + auto var = root_block->Var(fill_one_op_out); + var->SetDataType(target.GetDataType()); + var->SetShape(target.GetShape()); + auto& target_grad = retv[target.Name()]; + target_grad.name_ = fill_one_op_out; + target_grad.block_idx_ = root_block_idx; + target_grad.op_idx_ = static_cast(forward_op_num); + + // create grad_var for all blocks in this program + CreateGradVarInBlock(forward_op_num, grad_to_var, root_block, &retv); + for (size_t block_index = forward_block_num; + block_index < program_desc.Size(); ++block_index) { + CreateGradVarInBlock(0, grad_to_var, program_desc.MutableBlock(block_index), + &retv); + } + return retv; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/backward.h b/paddle/fluid/framework/backward.h new file mode 100644 index 0000000000000000000000000000000000000000..2ea6922426e1dad0ca9b6e1287701bca0adef5c8 --- /dev/null +++ b/paddle/fluid/framework/backward.h @@ -0,0 +1,56 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { + +// Create the backward operator from a forward operator. +// TODO(yuyang18): Add more API reference comment. +extern std::unique_ptr Backward( + const OperatorBase& forwardOp, + const std::unordered_set& no_grad_vars); + +struct GradVarInfo { + GradVarInfo() {} + GradVarInfo(const std::string& name, int block_idx, int op_idx) + : name_(name), block_idx_(block_idx), op_idx_(op_idx) {} + + bool operator==(const GradVarInfo& b) const { + return name_ == b.name_ && block_idx_ == b.block_idx_ && + op_idx_ == b.op_idx_; + } + + std::string name_; + int block_idx_; + int op_idx_; +}; + +using ParamGradInfoMap = std::unordered_map; + +ParamGradInfoMap AppendBackward( + ProgramDesc& program_desc, const VarDesc& target, + const std::unordered_set& no_grad_vars); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/backward_test.cc b/paddle/fluid/framework/backward_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f9604c68913f98abc4d52c84bc8fa2c02e1a6a31 --- /dev/null +++ b/paddle/fluid/framework/backward_test.cc @@ -0,0 +1,918 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/backward.h" + +#include +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/var_desc.h" +#include "paddle/fluid/operators/net_op.h" + +USE_NO_KERNEL_OP(fill_constant); + +namespace paddle { +namespace framework { + +using DeviceContext = platform::DeviceContext; + +class NoneOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override {} +}; + +template +class NoneKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override {} +}; + +class RowWiseAddOpMaker : public OpProtoAndCheckerMaker { + public: + RowWiseAddOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input X of Add"); + AddInput("b", "Bias of Add"); + AddOutput("Out", "Out of Add"); + AddComment("Add Op"); + } +}; + +class RowWiseAddGradMaker : public SingleGradOpDescMaker { + public: + using SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto grad_op = new OpDesc(); + grad_op->SetInput(GradVarName("Out"), OutputGrad("Out")); + grad_op->SetOutput(GradVarName("X"), InputGrad("X")); + grad_op->SetOutput(GradVarName("b"), InputGrad("b")); + grad_op->SetType("rowwise_add_grad"); + return std::unique_ptr(grad_op); + } +}; + +class MulOpMaker : public OpProtoAndCheckerMaker { + public: + MulOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "A"); + AddInput("Y", "B"); + AddOutput("Out", "Out"); + AddAttr("x_num_col_dims", "").SetDefault(1).EqualGreaterThan(1); + AddAttr("y_num_col_dims", "").SetDefault(1).EqualGreaterThan(1); + AddComment("Mul"); + } +}; + +class SigmoidOpMaker : public OpProtoAndCheckerMaker { + public: + SigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "X"); + AddOutput("Out", "Y"); + AddComment("Sigmoid"); + } +}; + +class NoGradOpMaker : public OpProtoAndCheckerMaker { + public: + NoGradOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "X input"); + AddOutput("Out", "Y output"); + AddComment("NoGradOp, same input output. 
no Grad"); + } +}; + +class FcOp : public operators::NetOp { + public: + FcOp(const std::string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, const AttributeMap &attrs) + : NetOp(type, inputs, outputs, attrs) { + AppendOp(OpRegistry::CreateOp( + "mul", {{"X", {Input("X")}}, {"Y", {Input("W")}}}, + {{"Out", {Output("mul_result")}}}, AttributeMap{})); + auto input_b = Inputs("b"); + std::string before_act = "mul_result"; + if (input_b.size() != 0) { + AppendOp(OpRegistry::CreateOp( + "rowwise_add", {{"X", {Output("mul_result")}}, {"b", {input_b[0]}}}, + {{"Out", {Output("add_result")}}}, AttributeMap{})); + before_act = "add_result"; + } else { + auto out_varname = Output("add_result"); + if (out_varname != kEmptyVarName) { + this->Rename(out_varname, kEmptyVarName); + } + } + + AppendOp(OpRegistry::CreateOp("sigmoid", {{"X", {Output(before_act)}}}, + {{"Out", {Output("Out")}}}, AttributeMap{})); + CompleteAddOp(false); + } +}; + +class FcOpMaker : public OpProtoAndCheckerMaker { + public: + FcOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "x"); + AddInput("W", "w"); + AddInput("b", "b"); + AddOutput("mul_result", "").AsIntermediate(); + AddOutput("add_result", "").AsIntermediate(); + AddOutput("Out", ""); + AddComment(""); + } +}; + +class ManyOutputOpMaker : public OpProtoAndCheckerMaker { + public: + ManyOutputOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("x", "x"); + AddOutput("y", "y"); + AddOutput("z", "z"); + AddComment(""); + } +}; + +class FillZeroOpMaker : public OpProtoAndCheckerMaker { + public: + FillZeroOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "x"); + AddOutput("Out", "out"); + AddComment(""); + } +}; + +class SumOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SumOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "the input tensors of sum operator.").AsDuplicable(); + AddOutput("Out", "the output tensor of sum operator."); + AddComment(""); + } +}; + +class MultInOutOpMaker : public OpProtoAndCheckerMaker { + public: + MultInOutOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "x"); + AddInput("H", "h"); + AddOutput("Y", "y"); + AddOutput("Z", "z"); + AddComment(""); + } +}; + +class MinusGradOpDescMaker : public GradOpDescMakerBase { + public: + using GradOpDescMakerBase::GradOpDescMakerBase; + + std::vector> operator()() const override { + std::vector> retv; + auto x_g = InputGrad("X"); + if (!x_g.empty()) { + auto *op_desc = new OpDesc(); + op_desc->SetType("scale"); + op_desc->SetInput("X", OutputGrad("Out")); + op_desc->SetOutput("Out", x_g); + op_desc->SetAttr("scale", 1.0f); + retv.emplace_back(op_desc); + } + + auto y_g = InputGrad("Y"); + if (!y_g.empty()) { + auto *op_desc = new OpDesc(); + op_desc->SetType("scale"); + op_desc->SetInput("X", OutputGrad("Out")); + op_desc->SetOutput("Out", y_g); + op_desc->SetAttr("scale", -1.0f); + retv.emplace_back(op_desc); + } + return retv; + } +}; + +class MinusOpMaker : public OpProtoAndCheckerMaker { + public: + MinusOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", ""); + AddInput("Y", ""); + AddOutput("Out", ""); + AddComment("minus for unittest"); + } +}; +} // namespace framework +} // 
namespace paddle + +namespace f = paddle::framework; +namespace ops = paddle::operators; +using EnforceNotMet = paddle::platform::EnforceNotMet; +// rowwise_add +REGISTER_OPERATOR(rowwise_add, f::NoneOp, f::RowWiseAddOpMaker, + f::RowWiseAddGradMaker); +REGISTER_OP_CPU_KERNEL(rowwise_add, + f::NoneKernel); +REGISTER_OPERATOR(rowwise_add_grad, f::NoneOp); +REGISTER_OP_CPU_KERNEL(rowwise_add_grad, + f::NoneKernel); +// mul +REGISTER_OP(mul, f::NoneOp, f::MulOpMaker, mul_grad, f::NoneOp); +REGISTER_OP_CPU_KERNEL(mul, f::NoneKernel); +REGISTER_OP_CPU_KERNEL(mul_grad, + f::NoneKernel); +// sigmoid +REGISTER_OP(sigmoid, f::NoneOp, f::SigmoidOpMaker, sigmoid_grad, f::NoneOp); +REGISTER_OP_CPU_KERNEL(sigmoid, + f::NoneKernel); +REGISTER_OP_WITHOUT_GRADIENT(nograd, f::NoneOp, f::NoGradOpMaker); +// fill_zeros_like +REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, f::NoneOp, f::FillZeroOpMaker); +REGISTER_OP_CPU_KERNEL(fill_zeros_like, + f::NoneKernel); +// sum +REGISTER_OP(sum, f::NoneOp, f::SumOpMaker, sum_grad, f::NoneOp); +REGISTER_OP_CPU_KERNEL(sum, f::NoneKernel); +REGISTER_OP_CPU_KERNEL(sum_grad, + f::NoneKernel); +// fc +REGISTER_OP_WITHOUT_GRADIENT(fc, f::FcOp, f::FcOpMaker); +// many_output_op +REGISTER_OP(many_output_op, f::NoneOp, f::ManyOutputOpMaker, + many_output_op_grad, f::NoneOp); +// mult_in_out +REGISTER_OP(mult_in_out, f::NoneOp, f::MultInOutOpMaker, mult_in_out_grad, + f::NoneOp); +REGISTER_OP_CPU_KERNEL(mult_in_out, + f::NoneKernel); +REGISTER_OP_CPU_KERNEL(mult_in_out_grad, + f::NoneKernel); +// minus +REGISTER_OPERATOR(minus, f::NoneOp, f::MinusOpMaker, f::MinusGradOpDescMaker); +REGISTER_OP_CPU_KERNEL(minus, f::NoneKernel); +// scale +REGISTER_OPERATOR(scale, f::NoneOp); +REGISTER_OP_CPU_KERNEL(scale, f::NoneKernel); + +TEST(Backward, simple_op_not_need_grad) { + auto fwd = + f::OpRegistry::CreateOp("rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, + {{"Out", {"out"}}}, f::AttributeMap{}); + ASSERT_NE(fwd, nullptr); + auto gop = f::Backward(*fwd, {"x"}); + ASSERT_EQ(gop->Output(f::GradVarName("X")), f::kEmptyVarName); + + auto no_input_gop = f::Backward(*fwd, {"x", "b"}); + ASSERT_NE(no_input_gop, nullptr); + ASSERT_TRUE(no_input_gop->IsNetOp()); + ASSERT_EQ(0UL, static_cast(no_input_gop.get())->ops_.size()); +} + +TEST(Backward, net_fc_backward_normal) { + std::shared_ptr fwd = + f::OpRegistry::CreateOp("fc", {{"X", {"x"}}, {"W", {"w"}}, {"b", {"b"}}}, + {{"mul_result", {"mul_res"}}, + {"add_result", {"add_re"}}, + {"Out", {"out"}}}, + f::AttributeMap{}); + ASSERT_NE(fwd, nullptr); + std::shared_ptr gop = + f::Backward(*fwd, std::unordered_set{}); + ASSERT_TRUE(gop->IsNetOp()); + auto net = static_cast(gop.get()); + + ASSERT_NO_THROW(net->DebugString()); + + ASSERT_EQ(3UL, net->ops_.size()); + + f::OperatorBase &d_sigmoid = *net->ops_[0]; + ASSERT_EQ("sigmoid_grad", d_sigmoid.Type()); + + f::OperatorBase &d_add = *net->ops_[1]; + ASSERT_EQ("rowwise_add_grad", d_add.Type()); + + f::OperatorBase &d_mul = *net->ops_[2]; + ASSERT_EQ("mul_grad", d_mul.Type()); +} + +TEST(Backward, net_fc_backward_not_have_b) { + std::shared_ptr fwd = + f::OpRegistry::CreateOp("fc", {{"X", {"x"}}, {"W", {"w"}}, {"b", {}}}, + {{"mul_result", {"mul_res"}}, + {"add_result", {"add_res"}}, + {"Out", {"tmp"}}}, + f::AttributeMap{}); + ASSERT_NE(fwd, nullptr); + std::shared_ptr gop = + f::Backward(*fwd, std::unordered_set{}); + ASSERT_TRUE(gop->IsNetOp()); + auto net = static_cast(gop.get()); + + ASSERT_NO_THROW(net->DebugString()); + + ASSERT_EQ(2UL, net->ops_.size()); + + f::OperatorBase &d_sigmoid = 
*net->ops_[0]; + ASSERT_EQ("sigmoid_grad", d_sigmoid.Type()); + + f::OperatorBase &d_mul = *net->ops_[1]; + ASSERT_EQ("mul_grad", d_mul.Type()); +} + +TEST(Backward, net_input_of_network_not_need_grad) { + ops::NetOp net; + net.AppendOp(f::OpRegistry::CreateOp( + "fc", {{"X", {"x"}}, {"W", {"W1"}}, {"b", {"b1"}}}, + {{"mul_result", {"mul_tmp_0"}}, + {"add_result", {"add_tmp_0"}}, + {"Out", {"hidden0"}}}, + f::AttributeMap{})); + net.AppendOp(f::OpRegistry::CreateOp( + "fc", {{"X", {"hidden0"}}, {"W", {"W2"}}, {"b", {"b2"}}}, + {{"mul_result", {"mul_tmp_1"}}, + {"add_result", {"add_tmp_1"}}, + {"Out", {"hidden1"}}}, + f::AttributeMap{})); + net.CompleteAddOp(); + auto bwd = Backward(net, {"x"}); // x@GRAD is not need. + ASSERT_TRUE(bwd->IsNetOp()); + auto bwd_net = static_cast(bwd.get()); + + auto output_vars = bwd_net->OutputVars(true); + std::unordered_set all_outputs = + std::unordered_set(output_vars.begin(), output_vars.end()); + all_outputs.erase(f::kEmptyVarName); + + for (auto &out : {"W1", "b1", "hidden0", "W2", "b2"}) { + ASSERT_NE(all_outputs.find(f::GradVarName(out)), all_outputs.end()); + } + + // Not Generated X + ASSERT_EQ(all_outputs.find(f::GradVarName("X")), all_outputs.end()); + + ASSERT_EQ(2UL, bwd_net->ops_.size()); + ASSERT_TRUE(bwd_net->ops_[1]->IsNetOp()); + auto first_fc_grad = static_cast(bwd_net->ops_[1].get()); + ASSERT_EQ(3UL, first_fc_grad->ops_.size()); + ASSERT_EQ(f::kEmptyVarName, + first_fc_grad->ops_[2]->Output(f::GradVarName("X"))); +} + +TEST(Backward, net_shared_weight) { + ops::NetOp net; + net.AppendOp(f::OpRegistry::CreateOp("mul", {{"X", {"x"}}, {"Y", {"w"}}}, + {{"Out", {"out"}}}, f::AttributeMap{})); + net.AppendOp(f::OpRegistry::CreateOp("mul", {{"X", {"out"}}, {"Y", {"w"}}}, + {{"Out", {"FinalOut"}}}, + f::AttributeMap{})); + net.CompleteAddOp(); + + auto bwd = f::Backward(net, std::unordered_set{}); + ASSERT_TRUE(bwd->IsNetOp()); + auto bwd_net = static_cast(bwd.get()); + ASSERT_EQ(3UL, bwd_net->ops_.size()); + ASSERT_EQ("sum", bwd_net->ops_[2]->Type()); +} + +TEST(Backward, op_all_input_are_not_need) { + auto fwd = + f::OpRegistry::CreateOp("rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, + {{"Out", {"out"}}}, f::AttributeMap{}); + auto backward = f::Backward(*fwd, {"x", "b"}); + ASSERT_TRUE(backward->IsNetOp()); + auto net = static_cast(backward.get()); + ASSERT_TRUE(net->ops_.empty()); +} + +TEST(Backward, op_all_output_are_not_need) { + auto fwd = + f::OpRegistry::CreateOp("rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, + {{"Out", {"out"}}}, f::AttributeMap{}); + auto backward = f::Backward(*fwd, {"out"}); + ASSERT_TRUE(backward->IsNetOp()); + auto net = static_cast(backward.get()); + ASSERT_TRUE(net->ops_.empty()); +} + +TEST(Backward, op_part_of_output_are_not_need) { + auto fwd = + f::OpRegistry::CreateOp("many_output_op", {{"x", {"X"}}}, + {{"y", {"Y"}}, {"z", {"Z"}}}, f::AttributeMap{}); + auto backward = f::Backward(*fwd, {"Z"}); + ASSERT_TRUE(backward->IsNetOp()); + auto net = static_cast(backward.get()); + ASSERT_EQ(net->ops_.size(), 2UL); + + auto &fill_zero = *net->ops_[0]; + ASSERT_EQ("fill_zeros_like", fill_zero.Type()); + ASSERT_EQ(1UL, fill_zero.Inputs("X").size()); + ASSERT_EQ("Z", fill_zero.Input("X")); + ASSERT_EQ(1UL, fill_zero.Outputs("Out").size()); + ASSERT_EQ(std::string("Z") + f::kZeroVarSuffix, fill_zero.Output("Out")); + + auto &d_many_out = *net->ops_[1]; + ASSERT_EQ("many_output_op_grad", d_many_out.Type()); + ASSERT_EQ(1UL + 2UL + 2UL, d_many_out.Inputs().size()); // I/O/OG + ASSERT_EQ(std::string("Z") + 
f::kZeroVarSuffix, + d_many_out.Input(f::GradVarName("z"))); + ASSERT_EQ(f::GradVarName("Y"), d_many_out.Input(f::GradVarName("y"))); + ASSERT_EQ(f::GradVarName("X"), d_many_out.Output(f::GradVarName("x"))); +} + +TEST(Backward, op_part_of_input_are_not_need) { + auto fwd = f::OpRegistry::CreateOp("mul", {{"X", {"a"}}, {"Y", {"b"}}}, + {{"Out", {"out"}}}, f::AttributeMap{}); + auto backward = f::Backward(*fwd, {"a"}); + auto &grad_mul = *backward; + ASSERT_EQ(grad_mul.Type(), "mul_grad"); + ASSERT_EQ(grad_mul.Inputs().size(), 2UL + 1UL + 1UL); + ASSERT_EQ(grad_mul.Outputs().size(), 2UL); + ASSERT_EQ(grad_mul.Output(f::GradVarName("X")), f::kEmptyVarName); + ASSERT_EQ(grad_mul.Output(f::GradVarName("Y")), f::GradVarName("b")); + ASSERT_EQ(grad_mul.Input(f::GradVarName("Out")), f::GradVarName("out")); + ASSERT_EQ(grad_mul.Input("X"), "a"); + ASSERT_EQ(grad_mul.Input("Y"), "b"); + ASSERT_EQ(grad_mul.Input("Out"), "out"); +} + +TEST(Backward, linear_net_intermediate_variable_has_no_grad) { + ops::NetOp net; + net.AppendOp(f::OpRegistry::CreateOp( + "fc", {{"X", {"x1"}}, {"W", {"w1"}}, {"b", {"b1"}}}, + {{"mul_result", {"mul_out1"}}, + {"add_result", {"add_out1"}}, + {"Out", {"out1"}}}, + f::AttributeMap{})); + net.AppendOp(f::OpRegistry::CreateOp( + "fc", {{"X", {"out1"}}, {"W", {"w2"}}, {"b", {"b2"}}}, + {{"mul_result", {"mul_out2"}}, + {"add_result", {"tmp_out2"}}, + {"Out", {"out2"}}}, + f::AttributeMap{})); + net.AppendOp(f::OpRegistry::CreateOp( + "fc", {{"X", {"out2"}}, {"W", {"w3"}}, {"b", {"b3"}}}, + {{"mul_result", {"mul_out3"}}, + {"add_result", {"tmp_out3"}}, + {"Out", {"out3"}}}, + f::AttributeMap{})); + net.CompleteAddOp(); + + auto backward = f::Backward(net, {"mul_out2", "tmp_out2", "out2"}); + ASSERT_TRUE(backward->IsNetOp()); + auto bwd_net = static_cast(backward.get()); + ASSERT_EQ(bwd_net->ops_.size(), 3UL); + auto &grad_fc = *bwd_net->ops_[0]; + + const char *all = paddle::operators::NetOp::kAll; + EXPECT_EQ(grad_fc.Inputs(all).size(), + 2UL /* external input number */ + + 1UL /* external output number*/ + + 1UL /* number of gradient of external output*/ + + 2UL /* internal variable number*/ + ); + EXPECT_EQ(grad_fc.Outputs(all).size(), + 2UL /* input number of mul*/ + + 2UL /* input number of rowwise_add*/ + + 1UL /* input number of sigmod */ + - 1UL /* out2 is not needed*/); + EXPECT_EQ(bwd_net->ops_[1]->Inputs(all).size(), 0UL); + EXPECT_EQ(bwd_net->ops_[1]->Outputs(all).size(), 0UL); + EXPECT_EQ(bwd_net->ops_[2]->Inputs(all).size(), 0UL); + EXPECT_EQ(bwd_net->ops_[2]->Outputs(all).size(), 0UL); +} + +TEST(Backward, simple_single_op) { + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + + f::OpDesc *op = block->AppendOp(); + op->SetType("rowwise_add"); + op->SetInput("X", {"x"}); + op->SetInput("b", {"b"}); + op->SetOutput("Out", {"out"}); + + auto target = f::VarDesc("out"); + target.SetShape({1}); + auto var_to_grad = + AppendBackward(program, target, std::unordered_set{}); + + ASSERT_EQ(block->AllOps().size(), 3UL); + f::OpDesc *fill_op = block->AllOps()[1]; + EXPECT_EQ(fill_op->Type(), "fill_constant"); + + f::OpDesc *grad_op = block->AllOps()[2]; + EXPECT_EQ(grad_op->Type(), "rowwise_add_grad"); + ASSERT_EQ(grad_op->InputNames().size(), 1UL); + ASSERT_EQ(grad_op->OutputNames().size(), 2UL); + EXPECT_EQ(grad_op->Input(f::GradVarName("Out")), + std::vector({f::GradVarName("out")})); + EXPECT_EQ(grad_op->Output(f::GradVarName("X")), + std::vector({f::GradVarName("x")})); + EXPECT_EQ(grad_op->Output(f::GradVarName("b")), + 
std::vector({f::GradVarName("b")})); + + EXPECT_EQ(var_to_grad.size(), 3UL); + EXPECT_EQ(var_to_grad.at("b"), f::GradVarInfo(f::GradVarName("b"), 0, 2)); + EXPECT_EQ(var_to_grad.at("x"), f::GradVarInfo(f::GradVarName("x"), 0, 2)); + + EXPECT_TRUE(block->HasVar(f::GradVarName("b"))); + EXPECT_TRUE(block->HasVar(f::GradVarName("x"))); +} + +TEST(Backward, default_attribute) { + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + f::OpDesc *op = block->AppendOp(); + op->SetType("mul"); + op->SetInput("X", {"x"}); + op->SetInput("Y", {"y"}); + op->SetOutput("Out", {"out"}); + op->CheckAttrs(); + + auto target = f::VarDesc("out"); + target.SetShape({1}); + AppendBackward(program, target, std::unordered_set{}); + + ASSERT_EQ(block->AllOps().size(), 3UL); + EXPECT_EQ(boost::get(op->GetAttr("x_num_col_dims")), 1); + EXPECT_EQ(boost::get(op->GetAttr("y_num_col_dims")), 1); + + f::OpDesc *fill_op = block->AllOps()[1]; + EXPECT_EQ(fill_op->Type(), "fill_constant"); + + f::OpDesc *grad_op = block->AllOps()[2]; + ASSERT_EQ(grad_op->Type(), "mul_grad"); + EXPECT_EQ(boost::get(grad_op->GetAttr("x_num_col_dims")), 1); + EXPECT_EQ(boost::get(grad_op->GetAttr("y_num_col_dims")), 1); +} + +TEST(Backward, simple_mult_op) { + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + f::OpDesc *op1 = block->AppendOp(); + op1->SetType("rowwise_add"); + op1->SetInput("X", {"x1"}); + op1->SetInput("b", {"b1"}); + op1->SetOutput("Out", {"out1"}); + + f::OpDesc *op2 = block->AppendOp(); + op2->SetType("mul"); + op2->SetInput("X", {"out1"}); + op2->SetInput("Y", {"y2"}); + op2->SetOutput("Out", {"out2"}); + + f::OpDesc *op3 = block->AppendOp(); + op3->SetType("rowwise_add"); + op3->SetInput("X", {"out2"}); + op3->SetInput("b", {"b3"}); + op3->SetOutput("Out", {"out3"}); + + auto target = f::VarDesc("out3"); + target.SetShape({1}); + size_t forward_len = block->AllOps().size(); + auto var_to_grad = + AppendBackward(program, target, std::unordered_set{}); + + ASSERT_EQ(block->AllOps().size(), 6UL + 1); + f::OpDesc *fill_op = block->AllOps()[forward_len]; + EXPECT_EQ(fill_op->Type(), "fill_constant"); + + f::OpDesc *grad_op1 = block->AllOps()[6]; + EXPECT_EQ(grad_op1->Type(), "rowwise_add_grad"); + ASSERT_EQ(grad_op1->InputNames().size(), 1UL); + ASSERT_EQ(grad_op1->OutputNames().size(), 2UL); + EXPECT_EQ(grad_op1->Input(f::GradVarName("Out")), + std::vector({f::GradVarName("out1")})); + EXPECT_EQ(grad_op1->Output(f::GradVarName("X")), + std::vector({f::GradVarName("x1")})); + EXPECT_EQ(grad_op1->Output(f::GradVarName("b")), + std::vector({f::GradVarName("b1")})); + + f::OpDesc *grad_op2 = block->AllOps()[5]; + EXPECT_EQ(grad_op2->Type(), "mul_grad"); + ASSERT_EQ(grad_op2->InputNames().size(), 4UL); + ASSERT_EQ(grad_op2->OutputNames().size(), 2UL); + EXPECT_EQ(grad_op2->Input("X"), std::vector({"out1"})); + EXPECT_EQ(grad_op2->Input("Y"), std::vector({"y2"})); + EXPECT_EQ(grad_op2->Input("Out"), std::vector({"out2"})); + EXPECT_EQ(grad_op2->Input(f::GradVarName("Out")), + std::vector({f::GradVarName("out2")})); + EXPECT_EQ(grad_op2->Output(f::GradVarName("X")), + std::vector({f::GradVarName("out1")})); + EXPECT_EQ(grad_op2->Output(f::GradVarName("Y")), + std::vector({f::GradVarName("y2")})); + + f::OpDesc *grad_op3 = block->AllOps()[4]; + EXPECT_EQ(grad_op3->Type(), "rowwise_add_grad"); + ASSERT_EQ(grad_op3->InputNames().size(), 1UL); + ASSERT_EQ(grad_op3->OutputNames().size(), 2UL); + EXPECT_EQ(grad_op3->Input(f::GradVarName("Out")), + std::vector({f::GradVarName("out3")})); + 
EXPECT_EQ(grad_op3->Output(f::GradVarName("X")), + std::vector({f::GradVarName("out2")})); + EXPECT_EQ(grad_op3->Output(f::GradVarName("b")), + std::vector({f::GradVarName("b3")})); + + EXPECT_EQ(var_to_grad.size(), 7UL); + EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 6)); + EXPECT_EQ(var_to_grad.at("b1"), f::GradVarInfo(f::GradVarName("b1"), 0, 6)); + EXPECT_EQ(var_to_grad.at("out1"), + f::GradVarInfo(f::GradVarName("out1"), 0, 5)); + EXPECT_EQ(var_to_grad.at("y2"), f::GradVarInfo(f::GradVarName("y2"), 0, 5)); + EXPECT_EQ(var_to_grad.at("out2"), + f::GradVarInfo(f::GradVarName("out2"), 0, 4)); + EXPECT_EQ(var_to_grad.at("b3"), f::GradVarInfo(f::GradVarName("b3"), 0, 4)); + + EXPECT_TRUE(block->HasVar(f::GradVarName("x1"))); + EXPECT_TRUE(block->HasVar(f::GradVarName("b1"))); + EXPECT_TRUE(block->HasVar(f::GradVarName("out1"))); + EXPECT_TRUE(block->HasVar(f::GradVarName("y2"))); + EXPECT_TRUE(block->HasVar(f::GradVarName("out2"))); + EXPECT_TRUE(block->HasVar(f::GradVarName("b3"))); +} + +TEST(Backward, intermedia_var_no_grad) { + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + f::OpDesc *op1 = block->AppendOp(); + op1->SetType("rowwise_add"); + op1->SetInput("X", {"x1"}); + op1->SetInput("b", {"b1"}); + op1->SetOutput("Out", {"out1"}); + + f::OpDesc *op2 = block->AppendOp(); + op2->SetType("mul"); + op2->SetInput("X", {"x2"}); + op2->SetInput("Y", {"y2"}); + op2->SetOutput("Out", {"out2"}); + + f::OpDesc *op3 = block->AppendOp(); + op3->SetType("rowwise_add"); + op3->SetInput("X", {"out2"}); + op3->SetInput("b", {"b3"}); + op3->SetOutput("Out", {"out3"}); + + f::OpDesc *op4 = block->AppendOp(); + op4->SetType("mul"); + op4->SetInput("X", {"out1"}); + op4->SetInput("Y", {"out3"}); + op4->SetOutput("Out", {"out4"}); + + auto target = f::VarDesc("out4"); + target.SetShape({1}); + size_t forward_len = block->AllOps().size(); + auto var_to_grad = AppendBackward(program, target, {"out3"}); + + ASSERT_EQ(block->AllOps().size(), 7UL); + f::OpDesc *fill_op = block->AllOps()[forward_len]; + EXPECT_EQ(fill_op->Type(), "fill_constant"); + + f::OpDesc *grad_op1 = block->AllOps()[6]; + EXPECT_EQ(grad_op1->Type(), "rowwise_add_grad"); + ASSERT_EQ(grad_op1->InputNames().size(), 1UL); + ASSERT_EQ(grad_op1->OutputNames().size(), 2UL); + EXPECT_EQ(grad_op1->Input(f::GradVarName("Out")), + std::vector({f::GradVarName("out1")})); + EXPECT_EQ(grad_op1->Output(f::GradVarName("X")), + std::vector({f::GradVarName("x1")})); + EXPECT_EQ(grad_op1->Output(f::GradVarName("b")), + std::vector({f::GradVarName("b1")})); + + f::OpDesc *grad_op4 = block->AllOps()[5]; + EXPECT_EQ(grad_op4->Type(), "mul_grad"); + ASSERT_EQ(grad_op4->InputNames().size(), 4UL); + ASSERT_EQ(grad_op4->OutputNames().size(), 2UL); + EXPECT_EQ(grad_op4->Input("X"), std::vector({"out1"})); + EXPECT_EQ(grad_op4->Input("Y"), std::vector({"out3"})); + EXPECT_EQ(grad_op4->Input("Out"), std::vector({"out4"})); + EXPECT_EQ(grad_op4->Input(f::GradVarName("Out")), + std::vector({f::GradVarName("out4")})); + EXPECT_EQ(grad_op4->Output(f::GradVarName("X")), + std::vector({f::GradVarName("out1")})); + EXPECT_EQ(grad_op4->Output(f::GradVarName("Y")), std::vector()); + + EXPECT_EQ(var_to_grad.size(), 4UL); + EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 6)); + EXPECT_EQ(var_to_grad.at("b1"), f::GradVarInfo(f::GradVarName("b1"), 0, 6)); + EXPECT_EQ(var_to_grad.at("out1"), + f::GradVarInfo(f::GradVarName("out1"), 0, 5)); + + EXPECT_TRUE(block->HasVar(f::GradVarName("x1"))); + 
EXPECT_TRUE(block->HasVar(f::GradVarName("b1"))); + EXPECT_TRUE(block->HasVar(f::GradVarName("out1"))); +} + +TEST(Backward, var_no_grad) { + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + f::OpDesc *op1 = block->AppendOp(); + op1->SetType("mult_in_out"); + op1->SetInput("X", {"x1"}); + op1->SetInput("H", {"h1"}); + op1->SetOutput("Y", {"y1"}); + op1->SetOutput("Z", {"z1"}); + + f::OpDesc *op2 = block->AppendOp(); + op2->SetType("mult_in_out"); + op2->SetInput("X", {"y1"}); + op2->SetInput("H", {"z1"}); + op2->SetOutput("Y", {"y2"}); + op2->SetOutput("Z", {"z2"}); + + auto target = f::VarDesc("z2"); + target.SetShape({1}); + size_t forward_len = block->AllOps().size(); + auto var_to_grad = AppendBackward(program, target, {"z1"}); + + ASSERT_EQ(block->AllOps().size(), 6UL); + f::OpDesc *fill_op = block->AllOps()[forward_len]; + EXPECT_EQ(fill_op->Type(), "fill_constant"); + + f::OpDesc *grad_op2 = block->AllOps()[3]; + ASSERT_EQ(grad_op2->Type(), "mult_in_out_grad"); + ASSERT_EQ(grad_op2->InputNames().size(), 6UL); + ASSERT_EQ(grad_op2->OutputNames().size(), 2UL); + EXPECT_EQ(grad_op2->Input("X"), std::vector({"y1"})); + EXPECT_EQ(grad_op2->Input("H"), std::vector({"z1"})); + EXPECT_EQ(grad_op2->Input("Y"), std::vector({"y2"})); + EXPECT_EQ(grad_op2->Input("Z"), std::vector({"z2"})); + EXPECT_EQ(grad_op2->Input(f::GradVarName("Y")), + std::vector({f::GradVarName("y2")})); + EXPECT_EQ(grad_op2->Input(f::GradVarName("Z")), + std::vector({f::GradVarName("z2")})); + EXPECT_EQ(grad_op2->Output(f::GradVarName("X")), + std::vector({f::GradVarName("y1")})); + EXPECT_EQ(grad_op2->Output(f::GradVarName("H")), std::vector()); + + f::OpDesc *fill_zero_op = block->AllOps()[4]; + ASSERT_EQ(fill_zero_op->Type(), "fill_zeros_like"); + ASSERT_EQ(fill_zero_op->InputNames().size(), 1UL); + ASSERT_EQ(fill_zero_op->OutputNames().size(), 1UL); + EXPECT_EQ(fill_zero_op->Input("X"), std::vector({"z1"})); + EXPECT_EQ(fill_zero_op->Output("Out"), + std::vector({std::string("z1") + f::kZeroVarSuffix})); + + f::OpDesc *grad_op1 = block->AllOps()[5]; + ASSERT_EQ(grad_op1->Type(), "mult_in_out_grad"); + ASSERT_EQ(grad_op1->InputNames().size(), 6UL); + ASSERT_EQ(grad_op1->OutputNames().size(), 2UL); + EXPECT_EQ(grad_op1->Input("X"), std::vector({"x1"})); + EXPECT_EQ(grad_op1->Input("H"), std::vector({"h1"})); + EXPECT_EQ(grad_op1->Input("Y"), std::vector({"y1"})); + EXPECT_EQ(grad_op1->Input("Z"), std::vector({"z1"})); + EXPECT_EQ(grad_op1->Input(f::GradVarName("Y")), + std::vector({f::GradVarName("y1")})); + EXPECT_EQ(grad_op1->Input(f::GradVarName("Z")), + std::vector({std::string("z1") + f::kZeroVarSuffix})); + EXPECT_EQ(grad_op1->Output(f::GradVarName("X")), + std::vector({f::GradVarName("x1")})); + EXPECT_EQ(grad_op1->Output(f::GradVarName("H")), + std::vector({f::GradVarName("h1")})); + + EXPECT_EQ(var_to_grad.size(), 4UL); + EXPECT_EQ(var_to_grad.at("y1"), f::GradVarInfo(f::GradVarName("y1"), 0, 3)); + EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 5)); + EXPECT_EQ(var_to_grad.at("h1"), f::GradVarInfo(f::GradVarName("h1"), 0, 5)); + + EXPECT_TRUE(block->HasVar(f::GradVarName("y1"))); + EXPECT_TRUE(block->HasVar(f::GradVarName("x1"))); + EXPECT_TRUE(block->HasVar(f::GradVarName("h1"))); +} + +TEST(Backward, shared_var) { + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + f::OpDesc *op1 = block->AppendOp(); + op1->SetType("rowwise_add"); + op1->SetInput("X", {"x1"}); + op1->SetInput("b", {"b1"}); + op1->SetOutput("Out", {"out1"}); + + 
f::OpDesc *op2 = block->AppendOp(); + op2->SetType("mul"); + op2->SetInput("X", {"out1"}); + op2->SetInput("Y", {"y2"}); + op2->SetOutput("Out", {"out2"}); + + f::OpDesc *op3 = block->AppendOp(); + op3->SetType("rowwise_add"); + op3->SetInput("X", {"out1"}); + op3->SetInput("b", {"b3"}); + op3->SetOutput("Out", {"out3"}); + + auto target = f::VarDesc("out3"); + target.SetShape({1}); + size_t forward_len = block->AllOps().size(); + auto var_to_grad = + AppendBackward(program, target, std::unordered_set{}); + + ASSERT_EQ(block->AllOps().size(), 8UL); + f::OpDesc *fill_op = block->AllOps()[forward_len]; + EXPECT_EQ(fill_op->Type(), "fill_constant"); + + f::OpDesc *grad_op3 = block->AllOps()[4]; + ASSERT_EQ(grad_op3->Type(), "rowwise_add_grad"); + ASSERT_EQ(grad_op3->InputNames().size(), 1UL); + ASSERT_EQ(grad_op3->OutputNames().size(), 2UL); + EXPECT_EQ(grad_op3->Input(f::GradVarName("Out")), + std::vector({f::GradVarName("out3")})); + EXPECT_EQ(grad_op3->Output(f::GradVarName("X")), + std::vector({f::GradVarName("out1") + "@RENAME@0"})); + EXPECT_EQ(grad_op3->Output(f::GradVarName("b")), + std::vector({f::GradVarName("b3")})); + + f::OpDesc *grad_op4 = block->AllOps()[5]; + ASSERT_EQ(grad_op4->Type(), "mul_grad"); + ASSERT_EQ(grad_op4->InputNames().size(), 4UL); + ASSERT_EQ(grad_op4->OutputNames().size(), 2UL); + EXPECT_EQ(grad_op4->Input("X"), std::vector({"out1"})); + EXPECT_EQ(grad_op4->Input("Y"), std::vector({"y2"})); + EXPECT_EQ(grad_op4->Input("Out"), std::vector({"out2"})); + EXPECT_EQ(grad_op4->Input(f::GradVarName("Out")), + std::vector({f::GradVarName("out2")})); + EXPECT_EQ(grad_op4->Output(f::GradVarName("X")), + std::vector({f::GradVarName("out1") + "@RENAME@1"})); + EXPECT_EQ(grad_op4->Output(f::GradVarName("Y")), + std::vector({f::GradVarName("y2")})); + + f::OpDesc *sum_op = block->AllOps()[6]; + ASSERT_EQ(sum_op->Type(), "sum"); + ASSERT_EQ(sum_op->InputNames().size(), 1UL); + ASSERT_EQ(sum_op->OutputNames().size(), 1UL); + EXPECT_EQ(sum_op->Input("X"), + std::vector({f::GradVarName("out1") + "@RENAME@0", + f::GradVarName("out1") + "@RENAME@1"})); + EXPECT_EQ(sum_op->Output("Out"), + std::vector({f::GradVarName("out1")})); + + f::OpDesc *grad_op1 = block->AllOps()[7]; + ASSERT_EQ(grad_op1->Type(), "rowwise_add_grad"); + ASSERT_EQ(grad_op1->InputNames().size(), 1UL); + ASSERT_EQ(grad_op1->OutputNames().size(), 2UL); + EXPECT_EQ(grad_op1->Input(f::GradVarName("Out")), + std::vector({f::GradVarName("out1")})); + EXPECT_EQ(grad_op1->Output(f::GradVarName("X")), + std::vector({f::GradVarName("x1")})); + EXPECT_EQ(grad_op1->Output(f::GradVarName("b")), + std::vector({f::GradVarName("b1")})); + + EXPECT_EQ(var_to_grad.size(), 6UL); + EXPECT_EQ(var_to_grad.at("b3"), f::GradVarInfo(f::GradVarName("b3"), 0, 4)); + EXPECT_EQ(var_to_grad.at("y2"), f::GradVarInfo(f::GradVarName("y2"), 0, 5)); + EXPECT_EQ(var_to_grad.at("out1"), + f::GradVarInfo(f::GradVarName("out1"), 0, 6)); + EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 7)); + EXPECT_EQ(var_to_grad.at("b1"), f::GradVarInfo(f::GradVarName("b1"), 0, 7)); + + EXPECT_TRUE(block->HasVar(f::GradVarName("b3"))); + EXPECT_TRUE(block->HasVar(f::GradVarName("y2"))); + EXPECT_TRUE(block->HasVar(f::GradVarName("out1"))); + EXPECT_TRUE(block->HasVar(f::GradVarName("x1"))); + EXPECT_TRUE(block->HasVar(f::GradVarName("b1"))); +} + +TEST(Backward, half_backward) { + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + auto *op1 = block->AppendOp(); + op1->SetType("minus"); + op1->SetInput("X", {"a"}); 
+ op1->SetInput("Y", {"b"}); + op1->SetOutput("Out", {"out"}); + + auto target = f::VarDesc("out"); + target.SetShape({1}); + size_t forward_len = block->AllOps().size(); + auto var_to_grad = AppendBackward(program, target, {"b"}); + f::OpDesc *fill_op = block->AllOps()[forward_len]; + EXPECT_EQ(fill_op->Type(), "fill_constant"); + auto ops = block->AllOps(); + ASSERT_EQ(3UL, ops.size()); + + EXPECT_EQ(var_to_grad.size(), 2UL); + EXPECT_EQ(var_to_grad.at("a"), + f::GradVarInfo(f::GradVarName("a"), 0, forward_len + 1)); +} diff --git a/paddle/fluid/framework/block_desc.cc b/paddle/fluid/framework/block_desc.cc new file mode 100644 index 0000000000000000000000000000000000000000..9550159155c28247797a6caa5fc01c64a0c5f99f --- /dev/null +++ b/paddle/fluid/framework/block_desc.cc @@ -0,0 +1,190 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { + +VarDesc *BlockDesc::Var(const std::string &name) { + auto it = vars_.find(name); + if (it != vars_.end()) { + return it->second.get(); + } + need_update_ = true; + auto *var = new VarDesc(name); + vars_[name].reset(var); + return var; +} + +VarDesc *BlockDesc::FindVar(const std::string &name) const { + auto it = vars_.find(name); + if (it == vars_.end()) { + return nullptr; + } + return it->second.get(); +} + +bool BlockDesc::HasVar(const std::string &name) const { + return vars_.find(name) != vars_.end(); +} + +VarDesc *BlockDesc::FindVarRecursive(const std::string &name) const { + if (name == kEmptyVarName) return nullptr; + + auto it = vars_.find(name); + if (it == vars_.end()) { + return Parent() == kNoneBlockIndex ? 
nullptr + : ParentBlock()->FindVarRecursive(name); + } + return it->second.get(); +} + +VarDesc &BlockDesc::FindRecursiveOrCreateVar(const std::string &name_bytes) { + VarDesc *res = FindVarRecursive(name_bytes); + if (res == nullptr) { + res = Var(name_bytes); + } + return *res; +} + +bool BlockDesc::HasVarRecursive(const std::string &name) const { + return FindVarRecursive(name) != nullptr; +} + +std::vector BlockDesc::AllVars() const { + std::vector res; + for (const auto &p : vars_) { + res.push_back(p.second.get()); + } + return res; +} + +OpDesc *BlockDesc::AppendOp() { + need_update_ = true; + ops_.emplace_back(new OpDesc(this)); + return ops_.back().get(); +} + +void BlockDesc::AppendAllocatedOp(std::unique_ptr &&op_desc) { + need_update_ = true; + ops_.emplace_back(std::move(op_desc)); +} + +OpDesc *BlockDesc::PrependOp() { + need_update_ = true; + ops_.emplace_front(new OpDesc(this)); + return ops_.front().get(); +} + +void BlockDesc::RemoveOp(size_t s, size_t e) { + if (ops_.begin() + s == ops_.end() || ops_.begin() + e == ops_.end()) { + return; + } + need_update_ = true; + for (auto it = ops_.begin() + s; it != ops_.begin() + e; it++) { + auto names = (*it)->InputArgumentNames(); + for (auto n : names) { + // TODO(typhoonzero): delete vars if no other op use it. + VLOG(3) << "deleting var " << n; + } + } + ops_.erase(ops_.begin() + s, ops_.begin() + e); +} + +std::vector BlockDesc::AllOps() const { + std::vector res; + for (const auto &op : ops_) { + res.push_back(op.get()); + } + return res; +} + +void BlockDesc::Flush() { + for (auto &op_desc : ops_) { + op_desc->Flush(); + } + + if (need_update_) { + auto &op_field = *this->desc_->mutable_ops(); + this->ClearPBOps(); + op_field.Reserve(static_cast(ops_.size())); + for (auto &op_desc : ops_) { + op_field.AddAllocated(op_desc->Proto()); + } + auto &var_field = *this->desc_->mutable_vars(); + this->ClearPBVars(); + var_field.Reserve(static_cast(vars_.size())); + for (auto &var_desc : vars_) { + var_field.AddAllocated(var_desc.second->Proto()); + } + need_update_ = false; + } +} + +BlockDesc *BlockDesc::ParentBlock() const { + if (this->desc_->parent_idx() == kNoneBlockIndex) { + return nullptr; + } + return prog_->MutableBlock(static_cast(this->desc_->parent_idx())); +} + +proto::BlockDesc *BlockDesc::Proto() { + Flush(); + return desc_; +} + +BlockDesc::BlockDesc(ProgramDesc *prog, proto::BlockDesc *desc) + : prog_(prog), desc_(desc), need_update_(false) { + for (const proto::VarDesc &var_desc : desc_->vars()) { + vars_[var_desc.name()].reset(new VarDesc(var_desc)); + } + for (const proto::OpDesc &op_desc : desc_->ops()) { + ops_.emplace_back(new OpDesc(op_desc, prog, this)); + } +} + +BlockDesc::BlockDesc(const BlockDesc &other, proto::BlockDesc *desc, + ProgramDesc *prog) + : prog_(prog), desc_(desc) { + need_update_ = true; + for (auto &op : other.ops_) { + ops_.emplace_back(new OpDesc(*op->Proto(), prog, this)); + } + for (auto &it : other.vars_) { + auto *var = new VarDesc(*it.second); + vars_[it.first].reset(var); + } +} + +void BlockDesc::ClearPBOps() { + auto ops = this->desc_->mutable_ops(); + while (!ops->empty()) { + // we do not own the OpDesc, so release the ownership. + ops->ReleaseLast(); + } +} + +void BlockDesc::ClearPBVars() { + auto vars = this->desc_->mutable_vars(); + while (!vars->empty()) { + // we do not own the VarDesc, so release the ownership. 
+ vars->ReleaseLast(); + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/block_desc.h b/paddle/fluid/framework/block_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..5f7eca3878ff6174090c7b0dd4904f5604ac8dc6 --- /dev/null +++ b/paddle/fluid/framework/block_desc.h @@ -0,0 +1,111 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/proto_desc.h" +#include "paddle/fluid/framework/var_desc.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace framework { + +class ProgramDesc; + +// Each Protobuf Message, we provide a XXXBind class. In that class, we optimize +// read/write speed. Only when we want the protobuf message, the local changes +// will be synchronized (by `Sync` method). + +class BlockDesc { + public: + BlockDesc(ProgramDesc *prog, proto::BlockDesc *desc); + + BlockDesc(const BlockDesc &other, proto::BlockDesc *desc, ProgramDesc *prog); + + ~BlockDesc() { + this->ClearPBVars(); + this->ClearPBOps(); + } + + int32_t ID() const { return desc_->idx(); } + + int32_t Parent() const { return desc_->parent_idx(); } + + VarDesc *Var(const std::string &name_bytes); + + VarDesc *FindVar(const std::string &name_bytes) const; + + bool HasVar(const std::string &var_name) const; + + VarDesc *FindVarRecursive(const std::string &name_bytes) const; + + VarDesc &FindRecursiveOrCreateVar(const std::string &name_bytes); + + bool HasVarRecursive(const std::string &var_name) const; + + std::set LocalVarNames() const { + std::set var_names; + for (auto &var : vars_) { + var_names.insert(var.first); + } + return var_names; + } + + std::vector AllVars() const; + + BlockDesc *ParentBlock() const; + + OpDesc *AppendOp(); + + void AppendAllocatedOp(std::unique_ptr &&op_desc); + + OpDesc *PrependOp(); + + void RemoveOp(size_t s, size_t e); + + std::vector AllOps() const; + + size_t OpSize() const { return ops_.size(); } + + OpDesc *Op(int idx) { return ops_.at(idx).get(); } + + void Flush(); + + proto::BlockDesc *Proto(); + + ProgramDesc *Program() { return this->prog_; } + + private: + void ClearPBOps(); + void ClearPBVars(); + + private: + ProgramDesc *prog_; // not_own + proto::BlockDesc *desc_; // not_own + bool need_update_; + + std::deque> ops_; + std::unordered_map> vars_; + + DISABLE_COPY_AND_ASSIGN(BlockDesc); +}; +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/channel.h b/paddle/fluid/framework/channel.h new file mode 100644 index 0000000000000000000000000000000000000000..5acf4fb39bbeb6bd45d215c962f10f0333578c02 --- /dev/null +++ b/paddle/fluid/framework/channel.h @@ -0,0 +1,58 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include // for size_t + +namespace paddle { +namespace framework { + +// Channel is the abstract class of buffered and un-buffered channels. +template +class Channel { + public: + virtual bool Send(T*) = 0; + virtual bool Receive(T*) = 0; + virtual size_t Cap() = 0; + virtual void Close() = 0; + virtual ~Channel() {} +}; + +// Forward declaration of channel implementations. +namespace details { +template +class Buffered; +template +class UnBuffered; +} // namespace details + +template +Channel* MakeChannel(size_t buffer_size) { + if (buffer_size > 0) { + return new details::Buffered(buffer_size); + } + return new details::UnBuffered(); +} + +template +void CloseChannel(Channel* ch) { + ch->Close(); +} + +} // namespace framework +} // namespace paddle + +#include "paddle/fluid/framework/details/buffered_channel.h" +#include "paddle/fluid/framework/details/unbuffered_channel.h" diff --git a/paddle/fluid/framework/channel_test.cc b/paddle/fluid/framework/channel_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..953fa40fec8c0480726b44760a3a4c7f59c80a85 --- /dev/null +++ b/paddle/fluid/framework/channel_test.cc @@ -0,0 +1,510 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/channel.h" + +#include +#include + +#include "gtest/gtest.h" + +using paddle::framework::Channel; +using paddle::framework::MakeChannel; +using paddle::framework::CloseChannel; +using paddle::framework::details::Buffered; +using paddle::framework::details::UnBuffered; + +void RecevingOrderEqualToSendingOrder(Channel *ch) { + unsigned sum_send = 0; + std::thread t([&]() { + for (int i = 0; i < 5; i++) { + EXPECT_EQ(ch->Send(&i), true); + sum_send += i; + } + }); + for (int i = 0; i < 5; i++) { + int recv; + EXPECT_EQ(ch->Receive(&recv), true); + EXPECT_EQ(recv, i); + } + + CloseChannel(ch); + t.join(); + EXPECT_EQ(sum_send, 10U); + delete ch; +} + +TEST(Channel, MakeAndClose) { + using paddle::framework::details::Buffered; + using paddle::framework::details::UnBuffered; + { + // MakeChannel should return a buffered channel is buffer_size > 0. + auto ch = MakeChannel(10); + EXPECT_NE(dynamic_cast *>(ch), nullptr); + EXPECT_EQ(dynamic_cast *>(ch), nullptr); + CloseChannel(ch); + delete ch; + } + { + // MakeChannel should return an un-buffered channel is buffer_size = 0. 
+ auto ch = MakeChannel(0); + EXPECT_EQ(dynamic_cast *>(ch), nullptr); + EXPECT_NE(dynamic_cast *>(ch), nullptr); + CloseChannel(ch); + delete ch; + } +} + +TEST(Channel, SufficientBufferSizeDoesntBlock) { + const size_t buffer_size = 10; + auto ch = MakeChannel(buffer_size); + for (size_t i = 0; i < buffer_size; ++i) { + EXPECT_EQ(ch->Send(&i), true); // should not block + } + + size_t out; + for (size_t i = 0; i < buffer_size; ++i) { + EXPECT_EQ(ch->Receive(&out), true); // should not block + EXPECT_EQ(out, i); + } + CloseChannel(ch); + delete ch; +} + +// This tests that a channel must return false +// on send and receive performed after closing the channel. +// Receive will only return false after close when queue is empty. +// By creating separate threads for sending and receiving, we make this +// function able to test both buffered and unbuffered channels. +void SendReceiveWithACloseChannelShouldPanic(Channel *ch) { + const size_t data = 5; + std::thread send_thread{[&]() { + size_t i = data; + EXPECT_EQ(ch->Send(&i), true); // should not block + }}; + + std::thread recv_thread{[&]() { + size_t i; + EXPECT_EQ(ch->Receive(&i), true); // should not block + EXPECT_EQ(i, data); + }}; + + send_thread.join(); + recv_thread.join(); + + // After closing send should return false. Receive should + // also return false as there is no data in queue. + CloseChannel(ch); + send_thread = std::thread{[&]() { + size_t i = data; + EXPECT_EQ(ch->Send(&i), false); // should return false + }}; + recv_thread = std::thread{[&]() { + size_t i; + // should return false because channel is closed and queue is empty + EXPECT_EQ(ch->Receive(&i), false); + }}; + + send_thread.join(); + recv_thread.join(); +} + +TEST(Channel, SendReceiveClosedBufferedChannelPanics) { + size_t buffer_size = 10; + auto ch = MakeChannel(buffer_size); + SendReceiveWithACloseChannelShouldPanic(ch); + delete ch; +} + +TEST(Channel, SendReceiveClosedUnBufferedChannelPanics) { + auto ch = MakeChannel(0); + SendReceiveWithACloseChannelShouldPanic(ch); + delete ch; +} + +TEST(Channel, ReceiveFromBufferedChannelReturnResidualValuesTest) { + const size_t buffer_size = 10; + auto ch = MakeChannel(buffer_size); + + for (size_t i = 0; i < buffer_size; ++i) { + EXPECT_EQ(ch->Send(&i), true); // sending should not block + } + + size_t out; + for (size_t i = 0; i < buffer_size / 2; ++i) { + EXPECT_EQ(ch->Receive(&out), true); // receiving should not block + EXPECT_EQ(out, i); + } + + CloseChannel(ch); + + for (size_t i = buffer_size / 2; i < buffer_size; ++i) { + EXPECT_EQ(ch->Receive(&out), + true); // receving should return residual values. + EXPECT_EQ(out, i); + } + + for (size_t i = 0; i < buffer_size; ++i) { + EXPECT_EQ(ch->Receive(&out), + false); // receiving on closed channel should return false + } + delete ch; +} + +TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) { + const size_t buffer_size = 10; + auto ch = MakeChannel(buffer_size); + size_t sum = 0; + std::thread t([&]() { + // Try to write more than buffer size. 
+ for (size_t i = 0; i < 2 * buffer_size; ++i) { + if (i < buffer_size) + EXPECT_EQ(ch->Send(&i), true); // should block after 10 iterations + else + EXPECT_EQ(ch->Send(&i), false); + sum += i; + } + }); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.1 sec + EXPECT_EQ(sum, 45U); + + CloseChannel(ch); + t.join(); + delete ch; +} + +TEST(Channel, RecevingOrderEqualToSendingOrderWithUnBufferedChannel) { + auto ch = MakeChannel(0); + RecevingOrderEqualToSendingOrder(ch); +} + +TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel) { + auto ch = MakeChannel(10); + RecevingOrderEqualToSendingOrder(ch); +} + +void ChannelCloseUnblocksReceiversTest(Channel *ch) { + size_t num_threads = 5; + std::thread t[num_threads]; + bool thread_ended[num_threads]; + + // Launches threads that try to read and are blocked because of no writers + for (size_t i = 0; i < num_threads; i++) { + thread_ended[i] = false; + t[i] = std::thread( + [&](bool *p) { + int data; + EXPECT_EQ(ch->Receive(&data), false); + *p = true; + }, + &thread_ended[i]); + } + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.1 sec + + // Verify that all the threads are blocked + for (size_t i = 0; i < num_threads; i++) { + EXPECT_EQ(thread_ended[i], false); + } + + // Explicitly close the channel + // This should unblock all receivers + CloseChannel(ch); + + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.1 sec + + // Verify that all threads got unblocked + for (size_t i = 0; i < num_threads; i++) { + EXPECT_EQ(thread_ended[i], true); + } + + for (size_t i = 0; i < num_threads; i++) t[i].join(); +} + +void ChannelCloseUnblocksSendersTest(Channel *ch) { + using paddle::framework::details::Buffered; + using paddle::framework::details::UnBuffered; + + size_t num_threads = 5; + std::thread t[num_threads]; + bool thread_ended[num_threads]; + bool send_success[num_threads]; + + // Launches threads that try to write and are blocked because of no readers + for (size_t i = 0; i < num_threads; i++) { + thread_ended[i] = false; + send_success[i] = false; + t[i] = std::thread( + [&](bool *ended, bool *success) { + int data = 10; + *success = ch->Send(&data); + *ended = true; + }, + &thread_ended[i], &send_success[i]); + } + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait + + if (dynamic_cast *>(ch)) { + // If ch is Buffered, atleast 4 threads must be blocked. + int ct = 0; + for (size_t i = 0; i < num_threads; i++) { + if (!thread_ended[i]) ct++; + } + EXPECT_GE(ct, 4); + } else { + // If ch is UnBuffered, all the threads should be blocked. 
+ for (size_t i = 0; i < num_threads; i++) { + EXPECT_EQ(thread_ended[i], false); + } + } + // Explicitly close the thread + // This should unblock all senders + CloseChannel(ch); + + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait + + // Verify that all threads got unblocked + for (size_t i = 0; i < num_threads; i++) { + EXPECT_EQ(thread_ended[i], true); + } + + if (dynamic_cast *>(ch)) { + // Verify that only 1 send was successful + int ct = 0; + for (size_t i = 0; i < num_threads; i++) { + if (send_success[i]) ct++; + } + // Only 1 send must be successful + EXPECT_EQ(ct, 1); + } + + for (size_t i = 0; i < num_threads; i++) t[i].join(); +} + +// This tests that closing a buffered channel also unblocks +// any receivers waiting on the channel +TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) { + auto ch = MakeChannel(1); + ChannelCloseUnblocksReceiversTest(ch); + delete ch; +} + +// This tests that closing a buffered channel also unblocks +// any senders waiting for channel to have write space +TEST(Channel, BufferedChannelCloseUnblocksSendersTest) { + auto ch = MakeChannel(1); + ChannelCloseUnblocksSendersTest(ch); + delete ch; +} + +// This tests that closing an unbuffered channel also unblocks +// unblocks any receivers waiting for senders +TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) { + auto ch = MakeChannel(0); + ChannelCloseUnblocksReceiversTest(ch); + delete ch; +} + +// This tests that closing an unbuffered channel also unblocks +// unblocks any senders waiting for senders +TEST(Channel, UnbufferedChannelCloseUnblocksSendersTest) { + auto ch = MakeChannel(0); + ChannelCloseUnblocksReceiversTest(ch); + delete ch; +} + +TEST(Channel, UnbufferedLessReceiveMoreSendTest) { + auto ch = MakeChannel(0); + unsigned sum_send = 0; + // Send should block after three iterations + // since we only have three receivers. + std::thread t([&]() { + // Try to send more number of times + // than receivers + for (int i = 0; i < 4; i++) { + ch->Send(&i); + sum_send += i; + } + }); + for (int i = 0; i < 3; i++) { + int recv; + ch->Receive(&recv); + EXPECT_EQ(recv, i); + } + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.5 sec + EXPECT_EQ(sum_send, 3U); + + CloseChannel(ch); + t.join(); + delete ch; +} + +TEST(Channel, UnbufferedMoreReceiveLessSendTest) { + auto ch = MakeChannel(0); + unsigned sum_send = 0; + unsigned sum_receive = 0; + // The receiver should block after 5 + // iterations, since there are only 5 senders. + std::thread t([&]() { + for (int i = 0; i < 8; i++) { + int recv; + ch->Receive(&recv); // should block after the fifth iteration. 
+ EXPECT_EQ(recv, i); + sum_receive += i; + } + }); + for (int i = 0; i < 5; i++) { + ch->Send(&i); + sum_send += i; + } + std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec + EXPECT_EQ(sum_send, 10U); + EXPECT_EQ(sum_receive, 10U); + // send three more elements + for (int i = 5; i < 8; i++) { + ch->Send(&i); + sum_send += i; + } + + CloseChannel(ch); + t.join(); + EXPECT_EQ(sum_send, 28U); + EXPECT_EQ(sum_receive, 28U); + delete ch; +} + +// This tests that destroying a channel unblocks +// any senders waiting for channel to have write space +void ChannelDestroyUnblockSenders(Channel *ch) { + size_t num_threads = 5; + std::thread t[num_threads]; + bool thread_ended[num_threads]; + bool send_success[num_threads]; + + // Launches threads that try to write and are blocked because of no readers + for (size_t i = 0; i < num_threads; i++) { + thread_ended[i] = false; + send_success[i] = false; + t[i] = std::thread( + [&](bool *ended, bool *success) { + int data = 10; + *success = ch->Send(&data); + *ended = true; + }, + &thread_ended[i], &send_success[i]); + } + + std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec + bool is_buffered_channel = false; + if (dynamic_cast *>(ch)) is_buffered_channel = true; + + if (is_buffered_channel) { + // If channel is buffered, verify that atleast 4 threads are blocked + int ct = 0; + for (size_t i = 0; i < num_threads; i++) { + if (thread_ended[i] == false) ct++; + } + // Atleast 4 threads must be blocked + EXPECT_GE(ct, 4); + } else { + // Verify that all the threads are blocked + for (size_t i = 0; i < num_threads; i++) { + EXPECT_EQ(thread_ended[i], false); + } + } + // Explicitly destroy the channel + delete ch; + std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait + + // Verify that all threads got unblocked + for (size_t i = 0; i < num_threads; i++) { + EXPECT_EQ(thread_ended[i], true); + } + + // Count number of successfuld sends + int ct = 0; + for (size_t i = 0; i < num_threads; i++) { + if (send_success[i]) ct++; + } + + if (is_buffered_channel) { + // Only 1 send must be successful + EXPECT_EQ(ct, 1); + } else { + // In unbuffered channel, no send should be successful + EXPECT_EQ(ct, 0); + } + + // Join all threads + for (size_t i = 0; i < num_threads; i++) t[i].join(); +} + +// This tests that destroying a channel also unblocks +// any receivers waiting on the channel +void ChannelDestroyUnblockReceivers(Channel *ch) { + size_t num_threads = 5; + std::thread t[num_threads]; + bool thread_ended[num_threads]; + + // Launches threads that try to read and are blocked because of no writers + for (size_t i = 0; i < num_threads; i++) { + thread_ended[i] = false; + t[i] = std::thread( + [&](bool *p) { + int data; + // All reads should return false + EXPECT_EQ(ch->Receive(&data), false); + *p = true; + }, + &thread_ended[i]); + } + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait + + // Verify that all threads are blocked + for (size_t i = 0; i < num_threads; i++) { + EXPECT_EQ(thread_ended[i], false); + } + // delete the channel + delete ch; + std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait + // Verify that all threads got unblocked + for (size_t i = 0; i < num_threads; i++) { + EXPECT_EQ(thread_ended[i], true); + } + + for (size_t i = 0; i < num_threads; i++) t[i].join(); +} + +TEST(Channel, BufferedChannelDestroyUnblocksReceiversTest) { + size_t buffer_size = 1; + auto ch = MakeChannel(buffer_size); + ChannelDestroyUnblockReceivers(ch); +} + 
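As a quick orientation to the Channel API these tests exercise, the sketch below shows the intended producer/consumer pattern: MakeChannel picks Buffered or UnBuffered from the buffer size, Send/Receive block until a counterpart or a Close arrives, and CloseChannel drains residual values before unblocking everyone. This is an editor's illustrative sketch, not part of the patch; the angle-bracket template arguments (Channel<int>, MakeChannel<int>) are assumed here because the extracted diff text has dropped them.

#include <thread>
#include "paddle/fluid/framework/channel.h"

void ProducerConsumerSketch() {
  using paddle::framework::Channel;
  using paddle::framework::CloseChannel;
  using paddle::framework::MakeChannel;

  // buffer_size > 0 yields details::Buffered; 0 yields details::UnBuffered.
  Channel<int>* ch = MakeChannel<int>(4);

  std::thread producer([ch]() {
    for (int i = 0; i < 10; ++i) {
      ch->Send(&i);      // blocks while the buffer is full; returns false after Close
    }
    CloseChannel(ch);    // signals "no more data" and unblocks any blocked receiver
  });

  int value;
  while (ch->Receive(&value)) {
    // A buffered channel still hands out residual values after Close;
    // Receive returns false once the channel is closed and drained.
  }

  producer.join();
  delete ch;             // destroying the channel also unblocks remaining waiters
}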
+TEST(Channel, BufferedChannelDestroyUnblocksSendersTest) { + size_t buffer_size = 1; + auto ch = MakeChannel(buffer_size); + ChannelDestroyUnblockSenders(ch); +} + +// This tests that destroying an unbuffered channel also unblocks +// unblocks any receivers waiting for senders +TEST(Channel, UnbufferedChannelDestroyUnblocksReceiversTest) { + auto ch = MakeChannel(0); + ChannelDestroyUnblockReceivers(ch); +} + +TEST(Channel, UnbufferedChannelDestroyUnblocksSendersTest) { + auto ch = MakeChannel(0); + ChannelDestroyUnblockSenders(ch); +} diff --git a/paddle/fluid/framework/data_device_transform.cc b/paddle/fluid/framework/data_device_transform.cc new file mode 100644 index 0000000000000000000000000000000000000000..3c6dd28455b02aa71d8ed09d8c2c81397a6f9955 --- /dev/null +++ b/paddle/fluid/framework/data_device_transform.cc @@ -0,0 +1,45 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/data_device_transform.h" + +namespace paddle { +namespace framework { + +static const platform::DeviceContext* GetDeviceContext( + const platform::Place& src_place, const platform::Place& dst_place) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + + if (platform::is_gpu_place(src_place) && platform::is_cpu_place(dst_place)) { + return pool.Get(src_place); + } else if (platform::is_cpu_place(src_place) && + platform::is_gpu_place(dst_place)) { + return pool.Get(dst_place); + } else { + PADDLE_THROW( + "Currently, model parallelism is only supported between CPU and CUDA"); + } +} + +void TransDataDevice(const Tensor& in, const platform::Place& dst_place, + Tensor* out) { + VLOG(3) << "DeviceTransform in, src_place " << in.place() + << " dst_place: " << dst_place; + auto* dev_ctx = GetDeviceContext(in.place(), dst_place); + dev_ctx->Wait(); + Copy(in, dst_place, *dev_ctx, out); + dev_ctx->Wait(); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_device_transform.h b/paddle/fluid/framework/data_device_transform.h new file mode 100644 index 0000000000000000000000000000000000000000..0c4559f586aaf3cc055f9b53b050b3f3a97573bd --- /dev/null +++ b/paddle/fluid/framework/data_device_transform.h @@ -0,0 +1,28 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace framework { + +void TransDataDevice(const Tensor& in, const platform::Place& dst_place, + Tensor* out); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..f740f9b3268be31973774674bdea9eb404f718ed --- /dev/null +++ b/paddle/fluid/framework/data_device_transform_test.cu @@ -0,0 +1,168 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/init.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/elementwise_op_function.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace framework { + +template +struct AddFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a + b; } +}; + +class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { + public: + OpKernelTestProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("input", "input1 of test op"); + AddOutput("output", "output of test op"); + AddAttr("use_gpu", "force to use gpu kernel").SetDefault(false); + AddComment("This is test op"); + } +}; + +class TestOpWithKernel : public OperatorWithKernel { + public: + using OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override {} + OpKernelType GetExpectedKernelType( + const ExecutionContext& ctx) const override { + if (Attr("use_gpu")) { + VLOG(3) << "force use gpu kernel"; + return OpKernelType(proto::DataType::FP32, platform::CUDAPlace(0)); + } else { + VLOG(3) << "use default kernel"; + return OpKernelType(proto::DataType::FP32, + ctx.Input("input")->place()); + } + } +}; + +template +class TestKernel : public OpKernel { + public: + void Compute(const ExecutionContext& ctx) const { + std::cout << ctx.op().DebugString() << std::endl; + + const Tensor* input = ctx.Input("input"); + + std::cout << "input place:" << input->place() << std::endl; + auto* output = ctx.Output("output"); + output->Resize(input->dims()); + output->mutable_data(ctx.GetPlace()); + + operators::TransformFunctor, T, DeviceContext> functor( + input, input, output, ctx.template device_context(), + AddFunctor()); + functor.Run(); + } +}; + +} // namespace framework +} // namespace paddle + +REGISTER_OP_WITHOUT_GRADIENT( + test_op, paddle::framework::TestOpWithKernel, + paddle::framework::OpKernelTestProtoAndCheckerMaker); 
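For context on what TransDataDevice (introduced in data_device_transform.cc above) provides to this CPU-to-GPU kernel test, a minimal sketch of moving a CPU tensor onto the GPU follows. It is an editor's illustration rather than part of the patch, assumes a CUDA build, and restores the template argument on mutable_data<float> that the extracted text has dropped.

#include "paddle/fluid/framework/data_device_transform.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/place.h"

void CpuToGpuTransformSketch() {
  namespace f = paddle::framework;
  namespace p = paddle::platform;

  // Fill a small CPU tensor.
  f::Tensor cpu_tensor;
  float* src =
      cpu_tensor.mutable_data<float>(f::make_ddim({2, 3}), p::CPUPlace());
  for (int i = 0; i < 6; ++i) src[i] = static_cast<float>(i);

  // TransDataDevice selects the device context for the CPU<->GPU pair,
  // waits for in-flight work, copies the tensor, and waits again on return.
  f::Tensor gpu_tensor;
  f::TransDataDevice(cpu_tensor, p::CUDAPlace(0), &gpu_tensor);
}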
+REGISTER_OP_CPU_KERNEL( + test_op, + paddle::framework::TestKernel); +REGISTER_OP_CUDA_KERNEL( + test_op, + paddle::framework::TestKernel); + +static void BuildVar(const std::string& param_name, + std::initializer_list arguments, + paddle::framework::proto::OpDesc::Var* var) { + var->set_parameter(param_name); + for (auto& arg_name : arguments) { + *var->mutable_arguments()->Add() = arg_name; + } +} + +TEST(Operator, CPUtoGPU) { + using namespace paddle::framework; + using namespace paddle::platform; + InitDevices(); + + paddle::framework::Scope scope; + paddle::platform::CPUPlace cpu_place; + + // create an op to run on CPU + paddle::framework::proto::OpDesc cpu_op_desc; + cpu_op_desc.set_type("test_op"); + BuildVar("input", {"IN1"}, cpu_op_desc.add_inputs()); + BuildVar("output", {"OUT1"}, cpu_op_desc.add_outputs()); + + auto cpu_op = paddle::framework::OpRegistry::CreateOp(cpu_op_desc); + // prepare input + auto* in_t = scope.Var("IN1")->GetMutable(); + auto* src_ptr = in_t->mutable_data({2, 3}, CPUPlace()); + for (int i = 0; i < 2 * 3; ++i) { + src_ptr[i] = static_cast(i); + } + + // get output + auto* output = scope.Var("OUT1"); + cpu_op->Run(scope, cpu_place); + + auto* output_ptr = output->Get().data(); + for (int i = 0; i < 2 * 3; ++i) { + ASSERT_EQ(output_ptr[i], static_cast(i) * 2); + } + + // create an op to run on GPU + paddle::framework::proto::OpDesc gpu_op_desc; + gpu_op_desc.set_type("test_op"); + BuildVar("input", {"OUT1"}, gpu_op_desc.add_inputs()); + BuildVar("output", {"OUT2"}, gpu_op_desc.add_outputs()); + + auto attr = gpu_op_desc.mutable_attrs()->Add(); + attr->set_name("use_gpu"); + attr->set_type(paddle::framework::proto::AttrType::BOOLEAN); + attr->set_b(true); + + auto gpu_op = paddle::framework::OpRegistry::CreateOp(gpu_op_desc); + + paddle::platform::CUDAPlace cuda_place(0); + // get output + auto* output2 = scope.Var("OUT2"); + gpu_op->Run(scope, cuda_place); + VLOG(3) << "after gpu_op run"; + + // auto* output2_ptr = output2->Get().data(); + DeviceContextPool& pool = DeviceContextPool::Instance(); + auto dev_ctx = pool.Get(cuda_place); + + paddle::framework::Tensor output_tensor; + Copy(output2->Get(), paddle::platform::CPUPlace(), *dev_ctx, + &output_tensor); + + dev_ctx->Wait(); + float* output2_ptr = output_tensor.data(); + for (int i = 0; i < 2 * 3; ++i) { + ASSERT_EQ(output2_ptr[i], static_cast(i) * 4); + } +} diff --git a/paddle/fluid/framework/data_layout.h b/paddle/fluid/framework/data_layout.h new file mode 100644 index 0000000000000000000000000000000000000000..b72f13f2e8f28556c195e65e3096b6ef1ba9e13a --- /dev/null +++ b/paddle/fluid/framework/data_layout.h @@ -0,0 +1,67 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include + +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { + +enum class DataLayout { + kNHWC = 0, + kNCHW = 1, + kAnyLayout = 2, +}; + +inline DataLayout StringToDataLayout(const std::string& str) { + std::string s(str); + for (size_t i = 0; i < s.size(); ++i) { + s[i] = toupper(s[i]); + } + + if (s == "NHWC") { + return DataLayout::kNHWC; + } else if (s == "NCHW") { + return DataLayout::kNCHW; + } else if (s == "ANYLAYOUT") { + return DataLayout::kAnyLayout; + } else { + PADDLE_THROW("Unknown storage order string: %s", s); + } +} + +inline std::string DataLayoutToString(const DataLayout& data_layout) { + switch (data_layout) { + case DataLayout::kNHWC: + return "NHWC"; + case DataLayout::kNCHW: + return "NCHW"; + case DataLayout::kAnyLayout: + return "ANY_LAYOUT"; + default: + PADDLE_THROW("unknown DataLayou %d", data_layout); + } +} + +inline std::ostream& operator<<(std::ostream& out, const DataLayout& l) { + out << DataLayoutToString(l); + return out; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc new file mode 100644 index 0000000000000000000000000000000000000000..c546a508fe1bc1c6e1608cb16a2e1f708a083895 --- /dev/null +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -0,0 +1,91 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/data_layout_transform.h" + +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace framework { + +std::vector GetAxis(const DataLayout& from, const DataLayout& to) { + PADDLE_ENFORCE_NE(from, to, + "layout transform should transform different layout"); + if (from == DataLayout::kNCHW && to == DataLayout::kNHWC) { + return {0, 2, 3, 1}; + } else if (from == DataLayout::kNHWC && to == DataLayout::kNCHW) { + return {0, 3, 1, 2}; + } else { + PADDLE_THROW("unsupported transform"); + } +} + +struct CastDataLayout { + CastDataLayout(const platform::DeviceContext* ctx, + const std::vector& axis, const framework::Tensor& in, + framework::Tensor* out) + : in_(in), out_(out), ctx_(ctx), axis_(axis) {} + const framework::Tensor in_; + framework::Tensor* out_; + const platform::DeviceContext* ctx_; + const std::vector axis_; + + template + void operator()() { + auto place = ctx_->GetPlace(); + + if (platform::is_cpu_place(place)) { + operators::math::Transpose trans4; + auto* context = static_cast(ctx_); + trans4(*context, in_, out_, axis_); + } else { + PADDLE_THROW("Unsupport CPU <-> GPU!"); + } + } +}; + +void TransDataLayout(const OpKernelType& kernel_type_for_var, + const OpKernelType& expected_kernel_type, const Tensor& in, + Tensor* out) { + PADDLE_ENFORCE( + platform::places_are_same_class(kernel_type_for_var.place_, + expected_kernel_type.place_), + "TransDataLayout only support DataLayout transform on same place!"); + + PADDLE_ENFORCE(arity(in.dims()) == 4, "Input Arity only support 4!"); + + auto& pool = platform::DeviceContextPool::Instance(); + + auto src_dim = in.dims(); + std::vector dst_dim; + + auto axis = GetAxis(kernel_type_for_var.data_layout_, + expected_kernel_type.data_layout_); + dst_dim.resize(axis.size()); + for (size_t i = 0; i < axis.size(); i++) { + dst_dim[i] = src_dim[axis[i]]; + } + + out->Resize(make_ddim(dst_dim)); + out->mutable_data(expected_kernel_type.place_, in.type()); + + framework::VisitDataType( + framework::ToDataType(in.type()), + CastDataLayout(pool.Get(expected_kernel_type.place_), axis, in, out)); + + out->set_layout(expected_kernel_type.data_layout_); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h new file mode 100644 index 0000000000000000000000000000000000000000..862405fbf466cd5e5fc819f42cc7392b1c4ca624 --- /dev/null +++ b/paddle/fluid/framework/data_layout_transform.h @@ -0,0 +1,31 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/fluid/framework/op_kernel_type.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/variable.h" + +namespace paddle { +namespace framework { + +std::vector GetAxis(const DataLayout& from, const DataLayout& to); + +void TransDataLayout(const OpKernelType& kernel_type_for_var, + const OpKernelType& expected_kernel_type, const Tensor& in, + Tensor* out); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_layout_transform_test.cc b/paddle/fluid/framework/data_layout_transform_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..99eb46bde34b089c3da65885748a1e77fe40c700 --- /dev/null +++ b/paddle/fluid/framework/data_layout_transform_test.cc @@ -0,0 +1,44 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/data_layout_transform.h" + +#include "gtest/gtest.h" +#include "paddle/fluid/platform/device_context.h" + +TEST(DataTransform, DataLayoutFunction) { + using namespace paddle::framework; + using namespace paddle::platform; + + auto place = CPUPlace(); + Tensor in = Tensor(); + Tensor out = Tensor(); + in.mutable_data(make_ddim({2, 3, 1, 2}), place); + in.set_layout(DataLayout::kNHWC); + + auto kernel_nhwc = OpKernelType(proto::DataType::FP32, place, + DataLayout::kNHWC, LibraryType::kPlain); + auto kernel_ncwh = OpKernelType(proto::DataType::FP32, place, + DataLayout::kNCHW, LibraryType::kPlain); + + TransDataLayout(kernel_nhwc, kernel_ncwh, in, &out); + + EXPECT_TRUE(out.layout() == DataLayout::kNCHW); + EXPECT_TRUE(out.dims() == make_ddim({2, 2, 3, 1})); + + TransDataLayout(kernel_ncwh, kernel_nhwc, in, &out); + + EXPECT_TRUE(in.layout() == DataLayout::kNHWC); + EXPECT_TRUE(in.dims() == make_ddim({2, 3, 1, 2})); +} \ No newline at end of file diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc new file mode 100644 index 0000000000000000000000000000000000000000..9575d01af8875cc21061979e54ce0612d8a7f3a5 --- /dev/null +++ b/paddle/fluid/framework/data_transform.cc @@ -0,0 +1,84 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/data_transform.h" + +#include "paddle/fluid/framework/data_device_transform.h" +#include "paddle/fluid/framework/data_layout_transform.h" +#include "paddle/fluid/framework/data_type_transform.h" + +namespace paddle { +namespace framework { + +static void PassTensorData(Tensor* from, Tensor* to) { + to->ShareDataWith(*from); + *from = Tensor(); +} + +void DataTransform(const OpKernelType& expected_kernel_type, + const OpKernelType& kernel_type_for_var, + const Tensor& input_tensor, Tensor* output_tensor) { + bool transformed = false; + Tensor in; + in.ShareDataWith(input_tensor); + Tensor out; + + // do layout transform + if (NeedTransformLayout(expected_kernel_type.data_layout_, + kernel_type_for_var.data_layout_)) { + TransDataLayout(kernel_type_for_var, expected_kernel_type, in, &out); + transformed = true; + PassTensorData(&out, &in); + } + + if (expected_kernel_type.data_type_ != kernel_type_for_var.data_type_) { + TransDataType(kernel_type_for_var, expected_kernel_type, in, &out); + transformed = true; + PassTensorData(&out, &in); + } + + // do device transform + if (!platform::is_same_place(kernel_type_for_var.place_, + expected_kernel_type.place_)) { + TransDataDevice(in, expected_kernel_type.place_, &out); + transformed = true; + PassTensorData(&out, &in); + } + + PADDLE_ENFORCE(transformed, "No transform is applied, please check!"); + // get output data + output_tensor->ShareDataWith(in); +} + +void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor, + Variable& out_var) { + if (in_var.IsType()) { + auto& in_lod_tensor = in_var.Get(); + auto* tran_lod_tensor = out_var.GetMutable(); + tran_lod_tensor->set_lod(in_lod_tensor.lod()); + tran_lod_tensor->set_layout(in_lod_tensor.layout()); + tran_lod_tensor->ShareDataWith(tensor); + } else if (in_var.IsType()) { + auto& in_selected_rows = in_var.Get(); + auto* trans_selected_rows = out_var.GetMutable(); + trans_selected_rows->set_height(in_selected_rows.height()); + trans_selected_rows->set_rows(in_selected_rows.rows()); + trans_selected_rows->mutable_value()->ShareDataWith(tensor); + } else { + PADDLE_THROW("unknown var type"); + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_transform.h b/paddle/fluid/framework/data_transform.h new file mode 100644 index 0000000000000000000000000000000000000000..70d3a174accc8beda06c550bc5ac9ee97897eb2e --- /dev/null +++ b/paddle/fluid/framework/data_transform.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/op_kernel_type.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/macros.h" +#include "paddle/fluid/platform/transform.h" + +namespace paddle { +namespace framework { + +void DataTransform(const OpKernelType& expected_kernel_type, + const OpKernelType& kernel_type_for_var, + const Tensor& input_tensor, Tensor* out); + +void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor, + Variable& out_var); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h new file mode 100644 index 0000000000000000000000000000000000000000..7a527f0d0c12806045d21b1cf279ccfd2cf73c8d --- /dev/null +++ b/paddle/fluid/framework/data_type.h @@ -0,0 +1,111 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { + +inline proto::DataType ToDataType(std::type_index type) { + using namespace paddle::framework::proto; + if (typeid(float).hash_code() == type.hash_code()) { + return DataType::FP32; + } else if (typeid(double).hash_code() == type.hash_code()) { + return DataType::FP64; + } else if (typeid(int).hash_code() == type.hash_code()) { + return DataType::INT32; + } else if (typeid(int64_t).hash_code() == type.hash_code()) { + return DataType::INT64; + } else if (typeid(bool).hash_code() == type.hash_code()) { + return DataType::BOOL; + } else { + PADDLE_THROW("Not supported"); + } +} + +inline std::type_index ToTypeIndex(proto::DataType type) { + using namespace paddle::framework::proto; + switch (type) { + case DataType::FP32: + return typeid(float); + case DataType::FP64: + return typeid(double); + case DataType::INT32: + return typeid(int); + case DataType::INT64: + return typeid(int64_t); + case DataType::BOOL: + return typeid(bool); + default: + PADDLE_THROW("Not support type %d", type); + } +} + +template +inline void VisitDataType(proto::DataType type, Visitor visitor) { + using namespace paddle::framework::proto; + switch (type) { + case DataType::FP32: + visitor.template operator()(); + break; + case DataType::FP64: + visitor.template operator()(); + break; + case DataType::INT32: + visitor.template operator()(); + break; + case DataType::INT64: + visitor.template operator()(); + break; + case DataType::BOOL: + visitor.template operator()(); + break; + default: + PADDLE_THROW("Not supported"); + } +} + +inline std::string DataTypeToString(const proto::DataType type) { + using namespace paddle::framework::proto; + switch (type) { + case DataType::FP16: + return "float16"; + case DataType::FP32: + 
return "float32"; + case DataType::FP64: + return "float64"; + case DataType::INT16: + return "int16"; + case DataType::INT32: + return "int32"; + case DataType::INT64: + return "int64"; + case DataType::BOOL: + return "bool"; + default: + PADDLE_THROW("Not support type %d", type); + } +} + +inline std::ostream& operator<<(std::ostream& out, + const proto::DataType& type) { + out << DataTypeToString(type); + return out; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc new file mode 100644 index 0000000000000000000000000000000000000000..6921927305aa3a7dee801ead888737a1ab93fc8e --- /dev/null +++ b/paddle/fluid/framework/data_type_transform.cc @@ -0,0 +1,89 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/data_type_transform.h" + +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/platform/transform.h" + +namespace paddle { +namespace framework { + +template +struct CastDataTypeFunctor { + HOSTDEVICE inline OutType operator()(InType in) const { + return static_cast(in); + } +}; + +template +struct CastDataType { + CastDataType(const framework::Tensor& in, framework::Tensor* out, + const platform::DeviceContext* ctx) + : in_(in), out_(out), ctx_(ctx) {} + const framework::Tensor in_; + framework::Tensor* out_; + const platform::DeviceContext* ctx_; + + template + void operator()() { + auto* in_begin = in_.data(); + auto* in_end = in_begin + in_.numel(); + auto* out_begin = out_->mutable_data(in_.place()); + + if (platform::is_cpu_place(in_.place())) { + platform::Transform trans; + auto* context = static_cast(ctx_); + trans(*context, in_begin, in_end, out_begin, + CastDataTypeFunctor()); + } else { + // TODO(dzhwinter): enhance Copy CPU<->GPU with different data type? 
+      PADDLE_THROW("Unsupport CPU <-> GPU!");
+    }
+  }
+};
+
+void TransDataType(const OpKernelType& kernel_type_for_var,
+                   const OpKernelType& expected_kernel_type, const Tensor& in,
+                   Tensor* out) {
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+
+  out->Resize(in.dims());
+  auto src_type = kernel_type_for_var.data_type_;
+  auto dst_type = expected_kernel_type.data_type_;
+  auto ctx = pool.Get(in.place());
+
+  switch (src_type) {
+    case proto::DataType::FP32:
+      framework::VisitDataType(dst_type, CastDataType<float>(in, out, ctx));
+      break;
+    case proto::DataType::FP64:
+      framework::VisitDataType(dst_type, CastDataType<double>(in, out, ctx));
+      break;
+    case proto::DataType::INT32:
+      framework::VisitDataType(dst_type, CastDataType<int>(in, out, ctx));
+      break;
+    case proto::DataType::INT64:
+      framework::VisitDataType(dst_type, CastDataType<int64_t>(in, out, ctx));
+      break;
+    case proto::DataType::BOOL:
+      framework::VisitDataType(dst_type, CastDataType<bool>(in, out, ctx));
+      break;
+    default:
+      PADDLE_THROW("Not support type %d", src_type);
+  }
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/data_type_transform.h b/paddle/fluid/framework/data_type_transform.h
new file mode 100644
index 0000000000000000000000000000000000000000..830cced093913839ddef2841b3de1017dc2bc426
--- /dev/null
+++ b/paddle/fluid/framework/data_type_transform.h
@@ -0,0 +1,32 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/framework/op_kernel_type.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/framework/variable.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+
+using KernelTypePair = std::pair<OpKernelType, OpKernelType>;
+
+void TransDataType(const OpKernelType& kernel_type_for_var,
+                   const OpKernelType& expected_kernel_type, const Tensor& in,
+                   Tensor* out);
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/data_type_transform_test.cc b/paddle/fluid/framework/data_type_transform_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..88dbc51b21718e2261f1c9485177621a827485e2
--- /dev/null
+++ b/paddle/fluid/framework/data_type_transform_test.cc
@@ -0,0 +1,53 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
 */
+
+#include "paddle/fluid/framework/data_type_transform.h"
+
+#include "gtest/gtest.h"
+
+TEST(DataTypeTransform, CPUTransform) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+
+  auto place = CPUPlace();
+
+  Tensor in;
+  Tensor out;
+
+  float* ptr = in.mutable_data<float>(make_ddim({2, 3}), place);
+  int data_number = 2 * 3;
+
+  for (int i = 0; i < data_number; ++i) {
+    ptr[i] = i / 3;
+  }
+
+  auto kernel_fp32 = OpKernelType(proto::DataType::FP32, place,
+                                  DataLayout::kAnyLayout, LibraryType::kPlain);
+  auto kernel_fp64 = OpKernelType(proto::DataType::FP64, place,
+                                  DataLayout::kAnyLayout, LibraryType::kPlain);
+  auto kernel_int32 = OpKernelType(proto::DataType::INT32, place,
+                                   DataLayout::kAnyLayout, LibraryType::kPlain);
+
+  TransDataType(kernel_fp32, kernel_fp64, in, &out);
+  double* out_data_double = out.data<double>();
+  for (int i = 0; i < data_number; ++i) {
+    ASSERT_EQ(out_data_double[i], static_cast<double>(i / 3));
+  }
+
+  TransDataType(kernel_fp32, kernel_int32, in, &out);
+  int* out_data_int = out.data<int>();
+  for (int i = 0; i < data_number; ++i) {
+    ASSERT_EQ(out_data_int[i], static_cast<int>(i / 3));
+  }
+}
diff --git a/paddle/fluid/framework/ddim.cc b/paddle/fluid/framework/ddim.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f063ee2e6dd81e66b7b74aa23e9967d865c4d297
--- /dev/null
+++ b/paddle/fluid/framework/ddim.cc
@@ -0,0 +1,318 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { + +/// @cond HIDDEN + +template +Dim make_dim(const int64_t* d) { + return Dim(*d, make_dim(d + 1)); +} + +template <> +Dim<1> make_dim<1>(const int64_t* d) { + return Dim<1>(*d); +} + +void make_ddim(DDim& ddim, const int64_t* dims, int n) { + switch (n) { + case 1: + ddim = make_dim<1>(dims); + break; + case 2: + ddim = make_dim<2>(dims); + break; + case 3: + ddim = make_dim<3>(dims); + break; + case 4: + ddim = make_dim<4>(dims); + break; + case 5: + ddim = make_dim<5>(dims); + break; + case 6: + ddim = make_dim<6>(dims); + break; + case 7: + ddim = make_dim<7>(dims); + break; + case 8: + ddim = make_dim<8>(dims); + break; + case 9: + ddim = make_dim<9>(dims); + break; + default: + PADDLE_THROW("Dynamic dimensions must have between [1, 9] dimensions."); + } +} + +/// @endcond + +DDim make_ddim(std::initializer_list dims) { + DDim result(make_dim(0)); + make_ddim(result, dims.begin(), dims.size()); + return result; +} + +DDim make_ddim(const std::vector& dims) { + DDim result(make_dim(0)); + make_ddim(result, &dims[0], dims.size()); + return result; +} + +DDim make_ddim(const std::vector& dims) { + std::vector res(dims.size()); + std::transform(dims.begin(), dims.end(), res.begin(), + [](int d) { return static_cast(d); }); + return make_ddim(res); +} + +/// @cond HIDDEN +// XXX For some reason, putting this in an anonymous namespace causes errors +class DynamicMutableIndexer : public boost::static_visitor { + public: + explicit DynamicMutableIndexer(int idx) : idx_(idx) {} + + template + int64_t& operator()(Dim& dim) const { + return dim[idx_]; + } + + private: + int idx_; +}; + +class DynamicConstIndexer : public boost::static_visitor { + public: + explicit DynamicConstIndexer(int idx) : idx_(idx) {} + + template + int64_t operator()(const Dim& dim) const { + return dim[idx_]; + } + + private: + int idx_; +}; + +/// @endcond + +int64_t& DDim::operator[](int idx) { + return boost::apply_visitor(DynamicMutableIndexer(idx), var); +} + +int64_t DDim::operator[](int idx) const { + return boost::apply_visitor(DynamicConstIndexer(idx), var); +} + +int DDim::size() const { return arity(*this); } + +bool DDim::operator==(DDim d) const { + if (var.which() != d.getVar().which()) { + return false; + } else { + std::vector v1 = vectorize(*this); + std::vector v2 = vectorize(d); + + for (unsigned int i = 0; i < v1.size(); i++) { + if (v1[i] != v2[i]) { + return false; + } + } + + return true; + } +} + +bool DDim::operator!=(DDim d) const { return !(*this == d); } + +DDim DDim::operator+(DDim d) const { + std::vector v1 = vectorize(*this); + std::vector v2 = vectorize(d); + + std::vector v3; + + assert(v1.size() == v2.size()); + + for (unsigned int i = 0; i < v1.size(); i++) { + v3.push_back(v1[i] + v2[i]); + } + + return make_ddim(v3); +} + +DDim DDim::operator*(DDim d) const { + std::vector v1 = vectorize(*this); + std::vector v2 = vectorize(d); + + std::vector v3; + + assert(v1.size() == v2.size()); + + for (unsigned int i = 0; i < v1.size(); i++) { + v3.push_back(v1[i] * v2[i]); + } + + return make_ddim(v3); +} + +int64_t get(const DDim& ddim, int idx) { return ddim[idx]; } + +void set(DDim& ddim, int idx, int value) { ddim[idx] = value; } + +/// @cond HIDDEN +struct VectorizeVisitor : public boost::static_visitor<> { + std::vector& vector; + + explicit VectorizeVisitor(std::vector& v) : vector(v) {} + + template + void operator()(const T& t) { + 
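+    // Each Dim<i> keeps its first extent in `head` and the remaining
+    // Dim<i - 1> in `tail`, so the visitor appends `head` and recurses until
+    // the Dim<1> overload below terminates the recursion, e.g.
+    //   vectorize(make_ddim({2, 3, 4}))  ==>  std::vector<int64_t>{2, 3, 4}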
vector.push_back(t.head); + this->operator()(t.tail); + } + + void operator()(const Dim<1>& t) { vector.push_back(t.head); } +}; +/// @endcond + +std::vector vectorize(const DDim& ddim) { + std::vector result; + VectorizeVisitor visitor(result); + boost::apply_visitor(visitor, ddim); + return result; +} + +// NOTE: framework::vectorize converts to type int64_t +// which does not fit cudnn inputs. +std::vector vectorize2int(const DDim& ddim) { + std::vector temp = vectorize(ddim); + std::vector result(temp.begin(), temp.end()); + return result; +} + +struct ProductVisitor : public boost::static_visitor { + template + int64_t operator()(const Dim& dim) { + return product(dim); + } +}; + +int64_t product(const DDim& ddim) { + ProductVisitor visitor; + return boost::apply_visitor(visitor, ddim); +} + +struct SliceVectorizeVisitor : public boost::static_visitor<> { + std::vector& vector; + int begin; + int end; + + SliceVectorizeVisitor(std::vector& v, int b, int e) + : vector(v), begin(b), end(e) { + PADDLE_ENFORCE(begin < end, + "Begin index must be less than end index in ddim slice."); + PADDLE_ENFORCE(begin >= 0, + "Begin index can't be less than zero in ddim slice."); + } + + template + void operator()(const Dim& dim) { + if (begin == 0) { + vector.push_back(dim.head); + } else { + --begin; + } + --end; + if (end > 0) { + this->operator()(dim.tail); + } + } + + void operator()(const Dim<1>& dim) { + PADDLE_ENFORCE(end == 1, "End index in ddim slice is out of bound."); + vector.push_back(dim.head); + } +}; + +DDim slice_ddim(const DDim& dim, int begin, int end) { + std::vector vec; + vec.reserve(end - begin); + SliceVectorizeVisitor visitor(vec, begin, end); + boost::apply_visitor(visitor, dim); + return make_ddim(vec); +} + +/// \cond HIDDEN + +struct ArityVisitor : boost::static_visitor { + template + int operator()(Dim) const { + return D; + } +}; + +/// \endcond + +int arity(const DDim& d) { return boost::apply_visitor(ArityVisitor(), d); } + +/// \cond HIDDEN + +struct DDimPrinter : boost::static_visitor { + std::ostream& os; + explicit DDimPrinter(std::ostream& os_) : os(os_) {} + + template + void operator()(const T& t) { + os << t; + } +}; + +/// \endcond + +std::ostream& operator<<(std::ostream& os, const DDim& ddim) { + DDimPrinter printer(os); + boost::apply_visitor(printer, ddim); + return os; +} + +DDim::DDim(std::initializer_list init_list) { + *this = make_ddim(init_list); +} + +DDim flatten_to_2d(const DDim& src, int num_col_dims) { + int rank = src.size(); + return make_ddim({product(slice_ddim(src, 0, num_col_dims)), + product(slice_ddim(src, num_col_dims, rank))}); +} + +DDim flatten_to_1d(const DDim& src) { return make_ddim({product(src)}); } + +DDim stride(const DDim& ddim) { + std::vector strides(ddim.size()); + strides[ddim.size() - 1] = 1; + for (int i = ddim.size() - 2; i >= 0; --i) { + strides[i] = strides[i + 1] * ddim[i + 1]; + } + return framework::make_ddim(strides); +} +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ddim.h b/paddle/fluid/framework/ddim.h new file mode 100644 index 0000000000000000000000000000000000000000..750ab787abb72fa3f2984caf58354e327750aa3d --- /dev/null +++ b/paddle/fluid/framework/ddim.h @@ -0,0 +1,138 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include "paddle/fluid/framework/dim.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/variant.h" + +namespace paddle { +namespace framework { + +/** + * \brief A dynamically sized dimension. + * + * The number of dimensions must be between [1, 9]. + */ +struct DDim { + typedef boost::variant, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>, Dim<7>, + Dim<8>, Dim<9>> + DDimVar; + DDimVar var; + + DDim() : var(Dim<1>()) {} + + template + explicit DDim(const Dim& in) : var(in) {} + + /*implicit*/ DDim(std::initializer_list init_list); + + template + DDim& operator=(const Dim& in) { + var = in; + return *this; + } + + int64_t& operator[](int idx); + int64_t operator[](int idx) const; + + template + typename Visitor::result_type apply_visitor(Visitor& visitor) { + return var.apply_visitor(visitor); + } + + template + typename Visitor::result_type apply_visitor(Visitor& visitor) const { + return var.apply_visitor(visitor); + } + + DDimVar getVar() { return var; } + + bool operator==(DDim d) const; + + bool operator!=(DDim d) const; + + DDim operator+(DDim d) const; + + DDim operator*(DDim d) const; + + int size() const; +}; + +/** + * \brief Make a DDim from std::vector + * + * \param dims An vector of ints. Must be sized between [1, 9] + */ +DDim make_ddim(const std::vector& dims); + +DDim make_ddim(const std::vector& dims); + +/** + * \brief Make a DDim from an initializer list + * + * \param dims An initializer list of ints. Must be sized between [1, 9] + * + */ +DDim make_ddim(std::initializer_list dims); + +int64_t get(const DDim& dim, int idx); +void set(DDim& dim, int idx, int val); + +std::vector vectorize(const DDim& ddim); +std::vector vectorize2int(const DDim& ddim); + +int64_t product(const DDim& ddim); + +/** + * \brief Slice a ddim + * + * Slice dim with [begin, end). + * e.g. DDim d = make_ddim({1,2,3,4,5}); + * slice_ddim(d, 1, 3); ====> {2,3} + */ +DDim slice_ddim(const DDim& dim, int begin, int end); + +/** + * \brief What is the length of this dimension? + * + * \param Dynamic dimension to inspect + */ + +int arity(const DDim& ddim); + +std::ostream& operator<<(std::ostream&, const DDim&); + +// Reshape a tensor to a matrix. The matrix's first dimension(column length) +// will be the product of tensor's first `num_col_dims` dimensions. +DDim flatten_to_2d(const DDim& src, int num_col_dims); + +DDim flatten_to_1d(const DDim& src); + +DDim stride(const DDim& ddim); +} // namespace framework +} // namespace paddle + +namespace boost { + +template +T get(const paddle::framework::DDim& in) { + return boost::get(in.var); +} + +} // namespace boost diff --git a/paddle/fluid/framework/ddim_test.cc b/paddle/fluid/framework/ddim_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..18d305a4036840066a7d9c999a7e73db863274d7 --- /dev/null +++ b/paddle/fluid/framework/ddim_test.cc @@ -0,0 +1,97 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/ddim.h" + +TEST(DDim, Equality) { + // construct a DDim from an initialization list + paddle::framework::DDim ddim = paddle::framework::make_ddim({9, 1, 5}); + EXPECT_EQ(ddim[0], 9); + EXPECT_EQ(ddim[1], 1); + EXPECT_EQ(ddim[2], 5); + + // construct a DDim from a vector + std::vector vec({9, 1, 5}); + paddle::framework::DDim vddim = paddle::framework::make_ddim(vec); + EXPECT_EQ(ddim[0], 9); + EXPECT_EQ(ddim[1], 1); + EXPECT_EQ(ddim[2], 5); + + // mutate a DDim + ddim[1] = 2; + EXPECT_EQ(ddim[1], 2); + paddle::framework::set(ddim, 0, 6); + EXPECT_EQ(paddle::framework::get(ddim, 0), 6); + + // vectorize a DDim + std::vector res_vec = paddle::framework::vectorize(vddim); + EXPECT_EQ(res_vec[0], 9); + EXPECT_EQ(res_vec[1], 1); + EXPECT_EQ(res_vec[2], 5); + paddle::framework::Dim<3> d(3, 2, 1); + res_vec = paddle::framework::vectorize(paddle::framework::DDim(d)); + EXPECT_EQ(res_vec[0], 3); + EXPECT_EQ(res_vec[1], 2); + EXPECT_EQ(res_vec[2], 1); + + // add two DDims + paddle::framework::DDim ddim_sum = ddim + vddim; + EXPECT_EQ(ddim_sum[0], 15); + EXPECT_EQ(ddim_sum[1], 3); + EXPECT_EQ(ddim_sum[2], 10); + + // multiply two DDims + paddle::framework::DDim ddim_mul = ddim * vddim; + EXPECT_EQ(ddim_mul[0], 54); + EXPECT_EQ(ddim_mul[1], 2); + EXPECT_EQ(ddim_mul[2], 25); + + // arity of a DDim + EXPECT_EQ(paddle::framework::arity(ddim), 3); + EXPECT_EQ(ddim.size(), 3); + + // product of a DDim + EXPECT_EQ(paddle::framework::product(vddim), 45); + EXPECT_EQ( + paddle::framework::product(paddle::framework::make_ddim({3, 2, 5, 3})), + 90); + + // slice a DDim + paddle::framework::DDim ddim2 = + paddle::framework::make_ddim({1, 2, 3, 4, 5, 6}); + paddle::framework::DDim ss = paddle::framework::slice_ddim(ddim2, 2, 5); + EXPECT_EQ(arity(ss), 3); + EXPECT_EQ(ss[0], 3); + EXPECT_EQ(ss[1], 4); + EXPECT_EQ(ss[2], 5); + paddle::framework::DDim ss2 = paddle::framework::slice_ddim(ddim2, 0, 6); + EXPECT_EQ(arity(ss2), 6); + EXPECT_EQ(ss2[0], 1); + EXPECT_EQ(ss2[1], 2); + EXPECT_EQ(ss2[2], 3); + EXPECT_EQ(ss2[3], 4); + EXPECT_EQ(ss2[4], 5); + EXPECT_EQ(ss2[5], 6); +} + +TEST(DDim, Print) { + // print a DDim + std::stringstream ss; + paddle::framework::DDim ddim = paddle::framework::make_ddim({2, 3, 4}); + ss << ddim; + EXPECT_EQ("2, 3, 4", ss.str()); +} diff --git a/paddle/fluid/framework/details/buffered_channel.h b/paddle/fluid/framework/details/buffered_channel.h new file mode 100644 index 0000000000000000000000000000000000000000..88faf3acf7c17b0cb3770a8910e400a1f6688f5f --- /dev/null +++ b/paddle/fluid/framework/details/buffered_channel.h @@ -0,0 +1,142 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include + +#include "paddle/fluid/framework/channel.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace details { + +// Four of the properties of Buffered Channel: +// - A send to a full channel blocks temporarily until a receive from the +// channel or the channel is closed. +// - A receive from an empty channel blocks temporarily until a send to the +// channel or the channel is closed. +// - A send to a closed channel returns false immediately. +// - A receive from a closed channel returns false immediately. + +template +class Buffered : public paddle::framework::Channel { + friend Channel* paddle::framework::MakeChannel(size_t); + friend void paddle::framework::CloseChannel(Channel*); + + public: + virtual bool Send(T*); + virtual bool Receive(T*); + virtual size_t Cap() { return cap_; } + virtual void Close(); + virtual ~Buffered(); + + private: + size_t cap_; + std::mutex mu_; + std::condition_variable empty_cond_var_; + std::condition_variable full_cond_var_; + std::condition_variable destructor_cond_var_; + std::deque channel_; + std::atomic closed_{false}; + std::atomic send_ctr{0}; + std::atomic recv_ctr{0}; + + Buffered(size_t cap) : cap_(cap), closed_(false) { + PADDLE_ENFORCE_GT(cap, 0); + } + + void NotifyAllParticipants(std::unique_lock*); +}; + +template +bool Buffered::Send(T* item) { + bool ret = false; + if (closed_) { + return ret; + } + send_ctr++; + std::unique_lock lock(mu_); + full_cond_var_.wait(lock, + [this]() { return channel_.size() < cap_ || closed_; }); + if (!closed_) { + channel_.push_back(std::move(*item)); + lock.unlock(); + empty_cond_var_.notify_one(); + ret = true; + } + send_ctr--; + destructor_cond_var_.notify_one(); + return ret; +} + +template +bool Buffered::Receive(T* item) { + bool ret = false; + // Once the channel has been closed and all data has been consumed, + // just return false. Don't even try acquiring the mutex. 
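+  // Until then a receiver may still drain items that were buffered before
+  // Close() was called; only a closed *and* empty channel fails fast.
+  // Rough usage sketch (MakeChannel/CloseChannel are declared in channel.h):
+  //   Channel<int>* ch = MakeChannel<int>(4);  // buffered, capacity 4
+  //   int v = 7;
+  //   ch->Send(&v);     // returns false once the channel is closed
+  //   ch->Receive(&v);  // returns false only once closed and drained
+  //   CloseChannel(ch);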
+ if (closed_ && channel_.empty()) { + return false; + } + recv_ctr++; + std::unique_lock lock(mu_); + empty_cond_var_.wait(lock, [this]() { return !channel_.empty() || closed_; }); + if (!channel_.empty()) { + *item = std::move(channel_.front()); + channel_.pop_front(); + full_cond_var_.notify_one(); + ret = true; + } + recv_ctr--; + destructor_cond_var_.notify_one(); + return ret; +} + +template +void Buffered::Close() { + if (closed_) { + return; + } + std::unique_lock lock(mu_); + closed_ = true; + NotifyAllParticipants(&lock); +} + +template +Buffered::~Buffered() { + std::unique_lock lock(mu_); + closed_ = true; + channel_.clear(); + NotifyAllParticipants(&lock); + + // The destructor must wait for all readers and writers to complete their task + // The channel has been closed, so we will not accept new readers and writers + lock.lock(); + destructor_cond_var_.wait( + lock, [this]() { return send_ctr == 0 && recv_ctr == 0; }); +} + +template +void Buffered::NotifyAllParticipants(std::unique_lock* lock) { + lock->unlock(); + full_cond_var_.notify_all(); + empty_cond_var_.notify_all(); +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/details/cow_ptr.h b/paddle/fluid/framework/details/cow_ptr.h similarity index 100% rename from paddle/framework/details/cow_ptr.h rename to paddle/fluid/framework/details/cow_ptr.h diff --git a/paddle/fluid/framework/details/cow_ptr_test.cc b/paddle/fluid/framework/details/cow_ptr_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..d2142af277c0b356d83941b3baab1947cce31dac --- /dev/null +++ b/paddle/fluid/framework/details/cow_ptr_test.cc @@ -0,0 +1,35 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/fluid/framework/details/cow_ptr.h" +#include "gtest/gtest.h" + +namespace paddle { +namespace framework { +namespace details { + +TEST(COWPtr, all) { + COWPtr ptr(new int{0}); + ASSERT_EQ(ptr.Data(), 0); + COWPtr ptr2 = ptr; + ASSERT_EQ(ptr2.Data(), 0); + ASSERT_EQ(&ptr2.Data(), &ptr.Data()); + *ptr2.MutableData() = 10; + ASSERT_EQ(ptr.Data(), 0); + ASSERT_EQ(ptr2.Data(), 10); +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/op_registry.h b/paddle/fluid/framework/details/op_registry.h new file mode 100644 index 0000000000000000000000000000000000000000..d73604ad185a66ade0168f585d1951d0d7d4a5f9 --- /dev/null +++ b/paddle/fluid/framework/details/op_registry.h @@ -0,0 +1,142 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/grad_op_desc_maker.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/var_type_inference.h" + +namespace paddle { +namespace framework { +namespace details { + +enum OpInfoFillType { + kOperator = 0, + kOpProtoAndCheckerMaker = 1, + kGradOpDescMaker = 2, + kVarTypeInference = 3, + kShapeInference = 4 +}; + +template +struct OpInfoFillTypeID { + static constexpr OpInfoFillType ID() { + return std::is_base_of::value + ? kOperator + : (std::is_base_of::value + ? kOpProtoAndCheckerMaker + : (std::is_base_of::value + ? kGradOpDescMaker + : (std::is_base_of::value + ? kVarTypeInference + : (std::is_base_of::value + ? kShapeInference + : static_cast( + -1))))); + } +}; + +template ::ID()> +struct OpInfoFiller; + +template +class OperatorRegistrarRecursive; + +template +class OperatorRegistrarRecursive { + public: + using T = typename std::tuple_element>::type; + OperatorRegistrarRecursive(const char* op_type, OpInfo* info) { + OpInfoFiller fill; + fill(op_type, info); + constexpr auto size = sizeof...(ARGS); + OperatorRegistrarRecursive reg(op_type, + info); + (void)(reg); + } +}; + +template +class OperatorRegistrarRecursive { + public: + OperatorRegistrarRecursive(const char* op_type, OpInfo* info) {} +}; + +template +struct OpInfoFiller { + void operator()(const char* op_type, OpInfo* info) const { + info->creator_ = [](const std::string& type, const VariableNameMap& inputs, + const VariableNameMap& outputs, + const AttributeMap& attrs) { + return new T(type, inputs, outputs, attrs); + }; + } +}; + +template +struct OpInfoFiller { + void operator()(const char* op_type, OpInfo* info) const { + info->proto_ = new proto::OpProto; + info->checker_ = new OpAttrChecker(); + auto maker = T(info->proto_, info->checker_); + maker.Validate(); + info->proto_->set_type(op_type); + PADDLE_ENFORCE( + info->proto_->IsInitialized(), + "Fail to initialize %s's OpProto, because %s is not initialized", + op_type, info->proto_->InitializationErrorString()); + } +}; + +template +struct OpInfoFiller { + void operator()(const char* op_type, OpInfo* info) const { + info->grad_op_maker_ = []( + const OpDesc& fwd_op, + const std::unordered_set& no_grad_set, + std::unordered_map* grad_to_var, + const std::vector& grad_block) { + T maker(fwd_op, no_grad_set, grad_to_var, grad_block); + return maker(); + }; + } +}; + +template +struct OpInfoFiller { + void operator()(const char* op_type, OpInfo* info) const { + info->infer_var_type_ = [](const OpDesc& fwd_op, BlockDesc* block) { + T inference; + inference(fwd_op, block); + }; + } +}; + +template +struct OpInfoFiller { + void operator()(const char* op_type, OpInfo* info) const { + info->infer_shape_ = [](InferShapeContext* ctx) { + T inference; + inference(ctx); + }; + } +}; + +} // namespace details + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/unbuffered_channel.h b/paddle/fluid/framework/details/unbuffered_channel.h new file mode 100644 index 
0000000000000000000000000000000000000000..5c9424928cb7029aac813e7b2f29f81a0093f836 --- /dev/null +++ b/paddle/fluid/framework/details/unbuffered_channel.h @@ -0,0 +1,174 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include + +#include "paddle/fluid/framework/channel.h" + +namespace paddle { +namespace framework { +namespace details { + +// Four of the properties of UnBuffered Channel: +// - A send to a channel blocks temporarily until a receive from the +// channel or the channel is closed. +// - A receive from a channel blocks temporarily until a send to the +// channel or the channel is closed. +// - A send to a closed channel returns false immediately. +// - A receive from a closed channel returns false immediately. +template +class UnBuffered : public paddle::framework::Channel { + friend Channel* paddle::framework::MakeChannel(size_t); + friend void paddle::framework::CloseChannel(Channel*); + + public: + virtual bool Send(T*); + virtual bool Receive(T*); + virtual size_t Cap() { return 0; } + virtual void Close(); + virtual ~UnBuffered(); + + private: + std::mutex mu_ch_; + // Mutex for readers and writers who are waiting for other reader + // and writer to complete execution + std::recursive_mutex mu_read_, mu_write_; + // reader_found_ is set true when a reader is ready to accept data + // writer_found_ is set true when a writer is ready to send data + // A transaction occurs only when both are true + std::atomic reader_found_{false}, writer_found_{false}; + std::condition_variable cv_channel_; + std::condition_variable_any cv_reader_, cv_writer_, cv_destructor_; + T* item{nullptr}; + std::atomic closed_{false}; + std::atomic send_ctr{0}; + std::atomic recv_ctr{0}; + + UnBuffered() : closed_(false) {} + + void NotifyAllParticipants(std::unique_lock*); +}; + +// This function implements the concept of how data should +// be sent from a writer to a reader. +template +bool UnBuffered::Send(T* data) { + bool ret = false; + if (closed_) { + return ret; + } + send_ctr++; + // Prevent other writers from entering + std::unique_lock writer_lock(mu_write_); + writer_found_ = true; + std::unique_lock cv_lock(mu_write_); + // If writer comes first, it should wait till a reader arrives + cv_writer_.wait(cv_lock, + [this]() { return reader_found_ == true || closed_; }); + cv_reader_.notify_one(); + if (!closed_) { + std::unique_lock channel_lock(mu_ch_); + item = data; + channel_lock.unlock(); + cv_channel_.notify_one(); + channel_lock.lock(); + cv_channel_.wait(channel_lock, + [this]() { return item == nullptr || closed_; }); + ret = true; + } + writer_found_ = false; + send_ctr--; + cv_destructor_.notify_one(); + return ret; +} + +// This function implements the concept of how +// data that was sent by a writer is read from a reader. +template +bool UnBuffered::Receive(T* data) { + bool ret = false; + // If channel is closed, we don't even want any reader to enter. 
+ // Unlike a buffered channel, an unbuffered channel does not allow + // readers to read after closing because there is no buffer to be consumed. + if (closed_) return ret; + recv_ctr++; + // Prevent other readers from entering + std::unique_lock read_lock{mu_read_}; + reader_found_ = true; + std::unique_lock cv_lock{mu_read_}; + // If reader comes first, it should wait till a writer arrives + cv_reader_.wait(cv_lock, + [this]() { return writer_found_ == true || closed_; }); + cv_writer_.notify_one(); + if (!closed_) { + std::unique_lock lock_ch{mu_ch_}; + // Reader should wait for the writer to first write its data + cv_channel_.wait(lock_ch, [this]() { return item != nullptr || closed_; }); + if (!closed_) { + *data = std::move(*item); + item = nullptr; + lock_ch.unlock(); + ret = true; + } + cv_channel_.notify_one(); + } + reader_found_ = false; + recv_ctr--; + cv_destructor_.notify_one(); + return ret; +} + +// This function implements the sequence of events +// that take place once the channel is closed. +template +void UnBuffered::Close() { + if (closed_) { + return; + } + std::unique_lock lock(mu_ch_); + item = nullptr; + closed_ = true; + NotifyAllParticipants(&lock); +} + +// This function implements the sequence of events +// that are executed once the object of an UnBuffered +// channel is destroyed. +template +UnBuffered::~UnBuffered() { + std::unique_lock lock(mu_ch_); + item = nullptr; + closed_ = true; + NotifyAllParticipants(&lock); + lock.lock(); + cv_destructor_.wait(lock, + [this]() { return send_ctr == 0 && recv_ctr == 0; }); +} + +// This function notifies all the readers, writers and +// the channel condition variables. +template +void UnBuffered::NotifyAllParticipants(std::unique_lock* lock) { + lock->unlock(); + cv_writer_.notify_all(); + cv_channel_.notify_all(); + cv_reader_.notify_all(); +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/dim.h b/paddle/fluid/framework/dim.h new file mode 100644 index 0000000000000000000000000000000000000000..3938fd3df5b54443fcbaebf600840ccf2337a173 --- /dev/null +++ b/paddle/fluid/framework/dim.h @@ -0,0 +1,421 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include +#include + +#include "paddle/fluid/platform/assert.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace framework { + +// Statically sized, statically indexed dimension +template +struct Dim { + static constexpr int dimensions = i; + + template + HOSTDEVICE Dim(int64_t _head, Args... _tail) : head(_head), tail(_tail...) { + static_assert(sizeof...(_tail) == i - 1, + "Dim initialized with the wrong number of parameters"); + } + + HOSTDEVICE + Dim(int64_t _head, const Dim& _tail) : head(_head), tail(_tail) {} + + HOSTDEVICE + Dim() : head(0), tail() {} + + /** Construct a Dim from a linear index and size. Uses Fortran order + * indexing. 
*/ + HOSTDEVICE + Dim(int64_t idx, const Dim& size) + : head(idx % size.head), tail(idx / size.head, size.tail) {} + + /** Construct a Dim with each dimension set to the given index */ + HOSTDEVICE + Dim(int64_t idx) : head(idx), tail(idx) {} + + HOSTDEVICE + bool operator==(const Dim& o) const { + return (head == o.head) && (tail == o.tail); + } + + HOSTDEVICE + bool operator!=(const Dim& o) const { return !(*this == o); } + + HOSTDEVICE + int64_t& operator[](int idx); + HOSTDEVICE + int64_t operator[](int idx) const; + + HOST std::string to_string() const; + + int64_t head; + Dim tail; +}; + +// Base case specialization +template <> +struct Dim<1> { + static constexpr int dimensions = 1; + + HOSTDEVICE + Dim(int64_t _head) : head(_head) {} + + HOSTDEVICE + Dim() : head(0) {} + + HOSTDEVICE + Dim(int idx, const Dim<1>& size) : head(idx) { +#ifndef __CUDA_ARCH__ + if (idx >= size.head) { + throw std::invalid_argument("Index out of range."); + } +#else + PADDLE_ASSERT(idx < size.head); +#endif + } + + HOSTDEVICE + bool operator==(const Dim<1>& o) const { return (head == o.head); } + + HOSTDEVICE + bool operator!=(const Dim<1>& o) const { return !(*this == o); } + + HOSTDEVICE + int64_t& operator[](int idx); + HOSTDEVICE + int64_t operator[](int idx) const; + + int64_t head; +}; + +namespace { + +// Helper for accessing Dim classes +template +struct DimGetter { + // Return a copy if Dim is const + template + HOSTDEVICE static int64_t impl(const D& d) { + return DimGetter::impl(d.tail); + } + // Return a reference if Dim is mutable + template + HOSTDEVICE static int64_t& impl(D& d) { + return DimGetter::impl(d.tail); + } +}; + +// Eureka! We found the element! +template <> +struct DimGetter<0> { + // Return a copy if Dim is const + template + HOSTDEVICE static int64_t impl(const D& d) { + return d.head; + } + // Return a reference if Dim is mutable + template + HOSTDEVICE static int64_t& impl(D& d) { + return d.head; + } +}; + +template +HOSTDEVICE int64_t& indexer(Dim& dim, int idx) { +#ifndef __CUDA_ARCH__ + if (idx < 0) { + throw std::invalid_argument("Tried to access a negative dimension"); + } +#else + PADDLE_ASSERT(idx >= 0); +#endif + if (idx == 0) { + return dim.head; + } + return indexer(dim.tail, idx - 1); +} + +template <> +HOSTDEVICE int64_t& indexer<1>(Dim<1>& dim, int idx) { +#ifndef __CUDA_ARCH__ + if (idx != 0) { + throw std::invalid_argument("Invalid index"); + } +#else + PADDLE_ASSERT(idx == 0); +#endif + return dim.head; +} + +template +HOSTDEVICE int64_t indexer(const Dim& dim, int idx) { +#ifndef __CUDA_ARCH__ + if (idx < 0) { + throw std::invalid_argument("Tried to access a negative dimension"); + } +#else + PADDLE_ASSERT(idx >= 0); +#endif + if (idx == 0) { + return dim.head; + } + return indexer(dim.tail, idx - 1); +} + +template <> +HOSTDEVICE int64_t indexer<1>(const Dim<1>& dim, int idx) { +#ifndef __CUDA_ARCH__ + if (idx != 0) { + throw std::invalid_argument("Invalid index"); + } +#else + PADDLE_ASSERT(idx == 0); +#endif + return dim.head; +} + +} // namespace +// Static access to constant Dim +template +HOSTDEVICE int64_t get(const Dim& d) { + return DimGetter::impl(d); +} + +// Static access to mutable Dim +template +HOSTDEVICE int64_t& get(Dim& d) { + return DimGetter::impl(d); +} + +// Dynamic access to constant Dim +template +HOSTDEVICE int64_t Dim::operator[](int i) const { + return indexer(*this, i); +} + +// Dynamic access to mutable Dim +template +HOSTDEVICE int64_t& Dim::operator[](int i) { + return indexer(*this, i); +} + +// Dynamic access to constant 
Dim +inline HOSTDEVICE int64_t Dim<1>::operator[](int i) const { + return indexer(*this, i); +} + +// Dynamic access to mutable Dim +inline HOSTDEVICE int64_t& Dim<1>::operator[](int i) { + return indexer(*this, i); +} + +// Dynamic access to constant Dim +// without std::enable_if will try to instantiate this on get<0>(d) +template +HOSTDEVICE typename std::enable_if<(l > 0), int64_t>::type get(const Dim& d, + int i) { + return d[i]; +} + +// Dynamic access to mutable Dim +template +HOSTDEVICE typename std::enable_if<(l > 0), int64_t&>::type get(Dim& d, + int i) { + return d[i]; +} + +// Dot product of two dims +template +HOSTDEVICE int64_t linearize(const Dim& a, const Dim& b) { + return a.head * b.head + linearize(a.tail, b.tail); +} + +// Base case dot product of two Dims +// Notice it is inline because it is no longer a template +template <> +HOSTDEVICE inline int64_t linearize(const Dim<1>& a, const Dim<1>& b) { + return a.head * b.head; +} + +// Product of a Dim +template +HOSTDEVICE int64_t product(const Dim& a, int prod = 1) { + return prod * a.head * product(a.tail); +} + +// Base case product of a Dim +// Notice it is inline because it is no longer a template +template <> +HOSTDEVICE inline int64_t product(const Dim<1>& a, int prod) { + return prod * a.head; +} + +// Is 0 <= idx_i < size_i for all i? +template +HOSTDEVICE bool contained(const Dim& idx, const Dim& size) { + return ((0 <= idx.head) && (idx.head < size.head) && + contained(idx.tail, size.tail)); +} + +// Base case of is 0 <= idx_i < size_i ? +// Notice it is inline because it is no longer a template +template <> +HOSTDEVICE inline bool contained(const Dim<1>& idx, const Dim<1>& size) { + return ((0 <= idx.head) && (idx.head < size.head)); +} + +/** + * \brief Compute exclusive prefix-multiply of a Dim. + */ +template +HOSTDEVICE Dim ex_prefix_mul(const Dim& src, int mul = 1) { + return Dim(mul, ex_prefix_mul(src.tail, mul * src.head)); +} + +///\cond HIDDEN +// Base case of ex_prefix_mul +// Notice it is inline because it is no longer a template +template <> +HOSTDEVICE inline Dim<1> ex_prefix_mul(const Dim<1>& src, int mul) { + return Dim<1>(mul); +} +///\endcond + +/** + * Add two dimensions together + */ +template +HOSTDEVICE Dim dim_plus(const Dim& a, const Dim& b) { + return Dim(a.head + b.head, dim_plus(a.tail, b.tail)); +} + +// Base case +template <> +HOSTDEVICE inline Dim<1> dim_plus(const Dim<1>& a, const Dim<1>& b) { + return Dim<1>(a.head + b.head); +} + +template +HOSTDEVICE Dim operator+(const Dim& lhs, const Dim& rhs) { + return dim_plus(lhs, rhs); +} + +/** + * Multiply two dimensions together + */ +template +HOSTDEVICE Dim dim_mult(const Dim& a, const Dim& b) { + return Dim(a.head * b.head, dim_mult(a.tail, b.tail)); +} + +// Base case +template <> +HOSTDEVICE inline Dim<1> dim_mult(const Dim<1>& a, const Dim<1>& b) { + return Dim<1>(a.head * b.head); +} + +template +HOSTDEVICE Dim operator*(const Dim& lhs, const Dim& rhs) { + return dim_mult(lhs, rhs); +} + +/** + * \brief Normalize strides to ensure any dimension with extent 1 + * has stride 0. + * + * \param size Dim object containing the size of an array + * \param stride Dim object containing stride of an array + * \return Dim object the same size as \p size with normalized strides + * + */ + +template +HOSTDEVICE Dim normalize_strides(const Dim& size, const Dim& stride) { + int norm_stride = size.head == 1 ? 
0 : stride.head; + return Dim(norm_stride, normalize_strides(size.tail, stride.tail)); +} + +///\cond HIDDEN + +template <> +HOSTDEVICE inline Dim<1> normalize_strides(const Dim<1>& size, + const Dim<1>& stride) { + int norm_stride = size.head == 1 ? 0 : stride.head; + return Dim<1>(norm_stride); +} + +///\endcond + +/** + * Helper function to create a Dim + * + * \param idxes The type of Dim constructed depends on the number of params + * + */ + +template +HOSTDEVICE Dim make_dim(Args... idxes) { + return Dim(idxes...); +} + +// Allows us to output a Dim +// XXX For some reason, overloading fails to resolve this correctly +template +typename std::enable_if<(i > 1), std::ostream&>::type operator<<( + std::ostream& os, const Dim& d) { + os << d.head << ", " << d.tail; + return os; +} + +// Base case that allows us to output a Dim +// XXX I wish this could be an overload instead of a template +template +typename std::enable_if<(i == 1), std::ostream&>::type operator<<( + std::ostream& os, const Dim& d) { + os << d.head; + return os; +} + +template +HOST std::string Dim::to_string() const { + std::stringstream stream; + + stream << *this; + + return stream.str(); +} + +template +HOSTDEVICE Dim linear_to_dimension(int linear_index, Dim extents) { + Dim result; + + for (int i = 0; i < D - 1; ++i) { + result[i] = linear_index % extents[i]; + linear_index /= extents[i]; + } + + result[D - 1] = linear_index; + + return result; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/dim_test.cu b/paddle/fluid/framework/dim_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..0f1969d79775ba70661806d589ae3de2696b77e8 --- /dev/null +++ b/paddle/fluid/framework/dim_test.cu @@ -0,0 +1,114 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
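+
+// Dim<N> is the statically sized counterpart of DDim: each level keeps its
+// first extent in `head` and the rest in a nested Dim<N - 1> `tail`, and the
+// helpers (make_dim, get<I>, linearize, product, ex_prefix_mul, contained)
+// are HOSTDEVICE, so the same code runs on the CPU and inside CUDA kernels.
+// Rough host-side sketch of what the tests below exercise:
+//   auto d = paddle::framework::make_dim(3, 4);  // Dim<2> holding {3, 4}
+//   paddle::framework::get<0>(d);                // 3
+//   paddle::framework::product(d);               // 12
+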
+#include +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/dim.h" + +__global__ void test(paddle::framework::Dim<2>* o) { + o[0] = paddle::framework::make_dim(5, 6); +} + +__global__ void dyn_idx_gpu(int64_t* o) { + auto d = paddle::framework::make_dim(5, 6); + o[0] = d[1]; +} + +TEST(Dim, Equality) { + // construct a Dim on the CPU + auto a = paddle::framework::make_dim(3, 4); + EXPECT_EQ(paddle::framework::get<0>(a), 3); + EXPECT_EQ(paddle::framework::get<1>(a), 4); + + // construct a Dim on the GPU + thrust::device_vector> t(2); + test<<<1, 1>>>(thrust::raw_pointer_cast(t.data())); + a = t[0]; + EXPECT_EQ(paddle::framework::get<0>(a), 5); + EXPECT_EQ(paddle::framework::get<1>(a), 6); + + // linearization + auto b = paddle::framework::make_dim(7, 8); + EXPECT_EQ(paddle::framework::linearize(a, b), 83); + + // product + EXPECT_EQ(paddle::framework::product(a), 30); + + // mutate a Dim + paddle::framework::get<1>(b) = 10; + EXPECT_EQ(paddle::framework::get<0>(b), 7); + EXPECT_EQ(paddle::framework::get<1>(b), 10); + + // dynamic access + paddle::framework::get(b, 0) = 8; + b[1] = 11; + EXPECT_EQ(paddle::framework::get<0>(b), 8); + EXPECT_EQ(paddle::framework::get<1>(b), 11); + EXPECT_EQ(paddle::framework::get(b, 0), 8); + EXPECT_EQ(b[1], 11); + + // dynamic access on GPU + thrust::device_vector r(1); + dyn_idx_gpu<<<1, 1>>>(thrust::raw_pointer_cast(r.data())); + int64_t res = r[0]; + EXPECT_EQ(res, 6); + + // ex_prefix_mul + paddle::framework::Dim<3> c = + paddle::framework::ex_prefix_mul(paddle::framework::Dim<3>(3, 4, 5)); + EXPECT_EQ(paddle::framework::get<0>(c), 1); + EXPECT_EQ(paddle::framework::get<1>(c), 3); + EXPECT_EQ(paddle::framework::get<2>(c), 12); + + // generate from an index + auto size = paddle::framework::make_dim(4, 5, 2); + c = paddle::framework::Dim<3>(14, size); + EXPECT_EQ(paddle::framework::get<0>(c), 2); + EXPECT_EQ(paddle::framework::get<1>(c), 3); + EXPECT_EQ(paddle::framework::get<2>(c), 0); + c = paddle::framework::Dim<3>(25, size); + EXPECT_EQ(paddle::framework::get<0>(c), 1); + EXPECT_EQ(paddle::framework::get<1>(c), 1); + EXPECT_EQ(paddle::framework::get<2>(c), 1); +} + +TEST(Dim, Bool) { + auto a = paddle::framework::make_dim(3, 4); + auto b = paddle::framework::make_dim(5, 6); + auto c = paddle::framework::make_dim(3, 4); + + // in_bounds check + EXPECT_TRUE(paddle::framework::contained(a, b)); + EXPECT_FALSE(paddle::framework::contained(b, a)); + + // comparison + EXPECT_TRUE(a == a); + EXPECT_FALSE(a == b); + EXPECT_TRUE(a == c); +} + +TEST(Dim, Print) { + { + std::stringstream ss; + auto a = paddle::framework::make_dim(2, 3); + ss << a; + EXPECT_EQ(ss.str(), "2, 3"); + } + { + std::stringstream ss; + ss << paddle::framework::make_dim(8); + EXPECT_EQ(ss.str(), "8"); + } +} diff --git a/paddle/fluid/framework/eigen.h b/paddle/fluid/framework/eigen.h new file mode 100644 index 0000000000000000000000000000000000000000..d1b8c701a7941813c0fbf441b8a6c7f4d3811a6d --- /dev/null +++ b/paddle/fluid/framework/eigen.h @@ -0,0 +1,115 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/tensor.h" +#include "unsupported/Eigen/CXX11/Tensor" + +namespace paddle { +namespace framework { + +// EigenDim converts paddle::platform::DDim into Eigen::DSizes. +template +struct EigenDim { + using Type = Eigen::DSizes; + + static Type From(const DDim& dims) { + PADDLE_ENFORCE(arity(dims) == D, "D must match arity(DDim)"); + Type ret; + for (int64_t d = 0; d < arity(dims); d++) { + ret[d] = dims[d]; + } + return ret; + } +}; + +// Interpret paddle::platform::Tensor as EigenTensor and EigenConstTensor. +template +struct EigenTensor { + // TODO(qijun) Now, default type in unaligned, and we will make a benchmark on + // the speed of aligned and unaligned version in future. + using Type = Eigen::TensorMap>; + + using ConstType = + Eigen::TensorMap>; + + static Type From(Tensor& tensor, DDim dims) { + return Type(tensor.data(), EigenDim::From(dims)); + } + + static Type From(Tensor& tensor) { return From(tensor, tensor.dims_); } + + static ConstType From(const Tensor& tensor, DDim dims) { + return ConstType(tensor.data(), EigenDim::From(dims)); + } + + static ConstType From(const Tensor& tensor) { + return From(tensor, tensor.dims_); + } +}; + +template +struct EigenMatrix : public EigenTensor { + static typename EigenMatrix::Type Reshape(Tensor& tensor, int num_col_dims) { + int rank = tensor.dims_.size(); + PADDLE_ENFORCE(num_col_dims > 0 && num_col_dims < rank, + "`num_col_dims` must be between (0, rank_of_tensor)."); + return EigenMatrix::From(tensor, + flatten_to_2d(tensor.dims(), num_col_dims)); + } + + static typename EigenMatrix::ConstType Reshape(const Tensor& tensor, + int num_col_dims) { + int rank = tensor.dims_.size(); + PADDLE_ENFORCE(num_col_dims > 0 && num_col_dims < rank, + "`num_col_dims` must be between (0, rank_of_tensor)."); + return EigenMatrix::From(tensor, + flatten_to_2d(tensor.dims(), num_col_dims)); + } +}; + +template +struct EigenVector : public EigenTensor { + // Flatten reshapes a Tensor into an EigenVector. + static typename EigenVector::Type Flatten(Tensor& tensor) { + return EigenVector::From(tensor, {product(tensor.dims_)}); + } + + static typename EigenVector::ConstType Flatten(const Tensor& tensor) { + return EigenVector::From(tensor, {product(tensor.dims_)}); + } +}; + +template +struct EigenScalar { + // Scalar tensor (implemented as a rank-0 tensor) of scalar type T. + using Type = Eigen::TensorMap< + Eigen::TensorFixedSize, MajorType, IndexType>>; + using ConstType = Eigen::TensorMap< + Eigen::TensorFixedSize, MajorType, IndexType>>; + + static Type From(Tensor& tensor) { return Type(tensor.data()); } + + static ConstType From(const Tensor& tensor) { + return ConstType(tensor.data()); + } +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/eigen_test.cc b/paddle/fluid/framework/eigen_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f9e3abeccb34fd1778cac6918d24e30021b433e9 --- /dev/null +++ b/paddle/fluid/framework/eigen_test.cc @@ -0,0 +1,132 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/eigen.h" +#include + +namespace paddle { +namespace framework { + +TEST(EigenDim, From) { + EigenDim<3>::Type ed = EigenDim<3>::From(make_ddim({1, 2, 3})); + ASSERT_EQ(1, ed[0]); + ASSERT_EQ(2, ed[1]); + ASSERT_EQ(3, ed[2]); +} + +TEST(Eigen, Tensor) { + Tensor t; + float* p = t.mutable_data(make_ddim({1, 2, 3}), platform::CPUPlace()); + for (int i = 0; i < 1 * 2 * 3; i++) { + p[i] = static_cast(i); + } + + EigenTensor::Type et = EigenTensor::From(t); + + ASSERT_EQ(1, et.dimension(0)); + ASSERT_EQ(2, et.dimension(1)); + ASSERT_EQ(3, et.dimension(2)); + + for (int i = 0; i < 1; i++) { + for (int j = 0; j < 2; j++) { + for (int k = 0; k < 3; k++) { + ASSERT_NEAR((i * 2 + j) * 3 + k, et(i, j, k), 1e-6f); + } + } + } +} + +TEST(Eigen, ScalarFrom) { + Tensor t; + int* p = t.mutable_data(make_ddim({1}), platform::CPUPlace()); + *p = static_cast(100); + + EigenScalar::Type es = EigenScalar::From(t); + + ASSERT_EQ(0, es.dimension(0)); + ASSERT_EQ(100, es(0)); +} + +TEST(Eigen, VectorFrom) { + Tensor t; + float* p = t.mutable_data(make_ddim({6}), platform::CPUPlace()); + for (int i = 0; i < 6; i++) { + p[i] = static_cast(i); + } + + EigenVector::Type ev = EigenVector::From(t); + + ASSERT_EQ(6, ev.dimension(0)); + + for (int i = 0; i < 6; i++) { + ASSERT_NEAR(i, ev(i), 1e-6f); + } +} + +TEST(Eigen, VectorFlatten) { + Tensor t; + float* p = t.mutable_data(make_ddim({1, 2, 3}), platform::CPUPlace()); + for (int i = 0; i < 1 * 2 * 3; i++) { + p[i] = static_cast(i); + } + + EigenVector::Type ev = EigenVector::Flatten(t); + + ASSERT_EQ(1 * 2 * 3, ev.dimension(0)); + + for (int i = 0; i < 1 * 2 * 3; i++) { + ASSERT_NEAR(i, ev(i), 1e-6f); + } +} + +TEST(Eigen, Matrix) { + Tensor t; + float* p = t.mutable_data(make_ddim({2, 3}), platform::CPUPlace()); + for (int i = 0; i < 2 * 3; i++) { + p[i] = static_cast(i); + } + + EigenMatrix::Type em = EigenMatrix::From(t); + + ASSERT_EQ(2, em.dimension(0)); + ASSERT_EQ(3, em.dimension(1)); + + for (int i = 0; i < 2; i++) { + for (int j = 0; j < 3; j++) { + ASSERT_NEAR(i * 3 + j, em(i, j), 1e-6f); + } + } +} + +TEST(Eigen, MatrixReshape) { + Tensor t; + float* p = t.mutable_data({2, 3, 6, 4}, platform::CPUPlace()); + for (int i = 0; i < 2 * 3 * 6 * 4; ++i) { + p[i] = static_cast(i); + } + + EigenMatrix::Type em = EigenMatrix::Reshape(t, 2); + + ASSERT_EQ(2 * 3, em.dimension(0)); + ASSERT_EQ(6 * 4, em.dimension(1)); + + for (int i = 0; i < 2 * 3; i++) { + for (int j = 0; j < 6 * 4; j++) { + ASSERT_NEAR(i * 6 * 4 + j, em(i, j), 1e-6f); + } + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc new file mode 100644 index 0000000000000000000000000000000000000000..5e1358ab0e91c26e3df7f39a12c064691f794e0b --- /dev/null +++ b/paddle/fluid/framework/executor.cc @@ -0,0 +1,315 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
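The MatrixReshape expectations follow from folding the leading num_col_dims dimensions into one extent and the remaining dimensions into the other, so {2, 3, 6, 4} with num_col_dims = 2 becomes a 6 x 24 view over the same data. A plain sketch of that arithmetic, with a hypothetical FlattenTo2D helper standing in for the flatten_to_2d used above:

#include <cassert>
#include <cstdint>
#include <functional>
#include <numeric>
#include <utility>
#include <vector>

// Fold the first `num_col_dims` dimensions into one extent and the rest
// into the other, mirroring the shape arithmetic behind EigenMatrix::Reshape.
std::pair<int64_t, int64_t> FlattenTo2D(const std::vector<int64_t>& dims,
                                        int num_col_dims) {
  auto mid = dims.begin() + num_col_dims;
  int64_t rows = std::accumulate(dims.begin(), mid, int64_t{1},
                                 std::multiplies<int64_t>());
  int64_t cols = std::accumulate(mid, dims.end(), int64_t{1},
                                 std::multiplies<int64_t>());
  return {rows, cols};
}

int main() {
  auto shape = FlattenTo2D({2, 3, 6, 4}, 2);
  assert(shape.first == 6 && shape.second == 24);  // matches MatrixReshape above
  return 0;
}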
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/executor.h" + +#include + +#include "gflags/gflags.h" +#include "paddle/fluid/framework/feed_fetch_method.h" +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" // platform::Communicator +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/profiler.h" + +DECLARE_bool(benchmark); +DEFINE_bool(check_nan_inf, false, + "Checking whether operator produce NAN/INF or not. It will be " + "extremely slow so please use this flag wisely."); + +namespace paddle { +namespace framework { + +Executor::Executor(const platform::Place& place) : place_(place) {} + +static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) { + if (var_type == proto::VarDesc::LOD_TENSOR) { + var->GetMutable(); + } else if (var_type == proto::VarDesc::SELECTED_ROWS) { + var->GetMutable(); + } else if (var_type == proto::VarDesc::FEED_MINIBATCH) { + var->GetMutable(); + } else if (var_type == proto::VarDesc::FETCH_LIST) { + var->GetMutable(); + } else if (var_type == proto::VarDesc::STEP_SCOPES) { + var->GetMutable>(); + } else if (var_type == proto::VarDesc::LOD_RANK_TABLE) { + var->GetMutable(); + } else if (var_type == proto::VarDesc::LOD_TENSOR_ARRAY) { + var->GetMutable(); + } else if (var_type == proto::VarDesc::PLACE_LIST) { + var->GetMutable(); + } else if (var_type == proto::VarDesc::NCCL_COM) { + var->GetMutable(); + } else if (var_type == proto::VarDesc::READER) { + var->GetMutable(); + } else { + PADDLE_THROW( + "Variable type %d is not in " + "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, " + "LOD_RANK_TABLE, PLACE_LIST, READER]", + var_type); + } +} + +static void CheckTensorNANOrInf(const std::string& name, + const framework::Tensor& tensor) { + if (tensor.memory_size() == 0) { + return; + } + if (tensor.type().hash_code() != typeid(float).hash_code() && + tensor.type().hash_code() != typeid(double).hash_code()) { + return; + } + PADDLE_ENFORCE(!framework::HasInf(tensor), "Tensor %s has Inf", name); + PADDLE_ENFORCE(!framework::HasNAN(tensor), "Tensor %s has NAN", name); +} + +void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, + bool create_local_scope, bool create_vars) { + // TODO(tonyyang-svail): + // - only runs on the first device (i.e. 
no interdevice communication) + // - will change to use multiple blocks for RNN op and Cond Op + PADDLE_ENFORCE_LT(static_cast(block_id), pdesc.Size()); + auto& block = pdesc.Block(block_id); + + Scope* local_scope = scope; + if (create_vars) { + if (create_local_scope) { + local_scope = &scope->NewScope(); + for (auto& var : block.AllVars()) { + if (var->Name() == framework::kEmptyVarName) { + continue; + } + + if (var->Persistable()) { + auto* ptr = scope->Var(var->Name()); + CreateTensor(ptr, var->GetType()); + VLOG(3) << "Create Variable " << var->Name() + << " global, which pointer is " << ptr; + } else { + auto* ptr = local_scope->Var(var->Name()); + CreateTensor(ptr, var->GetType()); + VLOG(3) << "Create Variable " << var->Name() + << " locally, which pointer is " << ptr; + } + } + } else { + for (auto& var : block.AllVars()) { + auto* ptr = local_scope->Var(var->Name()); + CreateTensor(ptr, var->GetType()); + VLOG(3) << "Create variable " << var->Name() << ", which pointer is " + << ptr; + } + } // if (create_local_scope) + } // if (create_vars) + + for (auto& op_desc : block.AllOps()) { + auto op = paddle::framework::OpRegistry::CreateOp(*op_desc); + VLOG(3) << op->DebugStringEx(local_scope); + + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + platform::RecordEvent record_event(op->Type(), pool.Get(place_)); + + op->Run(*local_scope, place_); + if (FLAGS_benchmark) { + VLOG(2) << "Memory used after operator " + op->Type() + " running: " + << memory::memory_usage(place_); + } + if (FLAGS_check_nan_inf) { + for (auto& vname : op->OutputVars(true)) { + auto* var = local_scope->FindVar(vname); + if (var == nullptr) continue; + if (var->IsType()) { + CheckTensorNANOrInf(vname, var->Get()); + } + } + } + } + if (create_vars && create_local_scope) { + scope->DeleteScope(local_scope); + } + if (FLAGS_benchmark) { + VLOG(2) << "-------------------------------------------------------"; + VLOG(2) << "Memory used after deleting local scope: " + << memory::memory_usage(place_); + VLOG(2) << "-------------------------------------------------------"; + } +} + +// Check whether the block already has feed operators and feed_holder. +// Return false if the block does not have any feed operators. +// If some feed operators have been prepended to the block, check that +// the info contained in these feed operators matches the feed_targets +// and feed_holder_name. Raise exception when any mismatch is found. +// Return true if the block has feed operators and holder of matching info. 
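Putting the pieces above together, this Run overload (a) creates every variable of the chosen block, persistable ones in the outermost scope and the rest in a temporary local scope, and (b) executes the block's operators in order, optionally checking outputs for NaN/Inf. A minimal usage sketch, assuming a ProgramDesc built or deserialized elsewhere and a default-constructible Scope:

#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/place.h"

// Sketch only: `program` is assumed to have been constructed elsewhere,
// e.g. deserialized from a saved model.
void RunBlockZero(const paddle::framework::ProgramDesc& program) {
  paddle::framework::Scope scope;
  paddle::platform::CPUPlace place;

  paddle::framework::Executor executor(place);
  // Run block 0, creating a local scope for temporaries and creating the
  // block's variables on the fly.
  executor.Run(program, &scope, /*block_id=*/0,
               /*create_local_scope=*/true, /*create_vars=*/true);
}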
+static bool has_feed_operators( + BlockDesc* block, std::map& feed_targets, + const std::string& feed_holder_name) { + size_t feed_count = 0; + for (auto* op : block->AllOps()) { + if (op->Type() == kFeedOpType) { + feed_count++; + PADDLE_ENFORCE_EQ(op->Input("X")[0], feed_holder_name, + "Input to feed op should be '%s'", feed_holder_name); + std::string feed_target_name = op->Output("Out")[0]; + PADDLE_ENFORCE( + feed_targets.find(feed_target_name) != feed_targets.end(), + "Feed operator output name '%s' cannot be found in 'feed_targets'", + feed_target_name); + } + } + + if (feed_count > 0) { + PADDLE_ENFORCE_EQ( + feed_count, feed_targets.size(), + "The number of feed operators should match 'feed_targets'"); + + // When feed operator are present, so should be feed_holder + auto var = block->FindVar(feed_holder_name); + PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable", + feed_holder_name); + PADDLE_ENFORCE_EQ(var->GetType(), proto::VarDesc::FEED_MINIBATCH, + "'%s' variable should be 'FEED_MINIBATCH' type", + feed_holder_name); + } + + return feed_count > 0; +} + +// Check whether the block already has fetch operators and fetch_holder. +// Return false if the block does not have any fetch operators. +// If some fetch operators have been appended to the block, check that +// the info contained in these fetch operators matches the fetch_targets +// and fetch_holder_name. Raise exception when any mismatch is found. +// Return true if the block has fetch operators and holder of matching info. +static bool has_fetch_operators( + BlockDesc* block, std::map& fetch_targets, + const std::string& fetch_holder_name) { + size_t fetch_count = 0; + for (auto* op : block->AllOps()) { + if (op->Type() == kFetchOpType) { + fetch_count++; + PADDLE_ENFORCE_EQ(op->Output("Out")[0], fetch_holder_name, + "Output of fetch op should be '%s'", fetch_holder_name); + std::string fetch_target_name = op->Input("X")[0]; + PADDLE_ENFORCE( + fetch_targets.find(fetch_target_name) != fetch_targets.end(), + "Fetch operator input name '%s' cannot be found in 'fetch_targets'", + fetch_target_name); + } + } + + if (fetch_count > 0) { + PADDLE_ENFORCE_EQ( + fetch_count, fetch_targets.size(), + "The number of fetch operators should match 'fetch_targets'"); + + // When fetch operator are present, so should be fetch_holder + auto var = block->FindVar(fetch_holder_name); + PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable", + fetch_holder_name); + PADDLE_ENFORCE_EQ(var->GetType(), proto::VarDesc::FETCH_LIST, + "'%s' variable should be 'FETCH_LIST' type", + fetch_holder_name); + } + + return fetch_count > 0; +} + +void Executor::Run(const ProgramDesc& program, Scope* scope, + std::map& feed_targets, + std::map& fetch_targets, + const std::string& feed_holder_name, + const std::string& fetch_holder_name) { + auto* copy_program = new ProgramDesc(program); + auto* global_block = copy_program->MutableBlock(0); + + if (!has_feed_operators(global_block, feed_targets, feed_holder_name)) { + // create feed_holder variable + auto* feed_holder = global_block->Var(feed_holder_name); + feed_holder->SetType(proto::VarDesc::FEED_MINIBATCH); + feed_holder->SetPersistable(true); + + int i = 0; + for (auto& feed_target : feed_targets) { + std::string var_name = feed_target.first; + VLOG(3) << "feed target's name: " << var_name; + + // prepend feed op + auto* op = global_block->PrependOp(); + op->SetType(kFeedOpType); + op->SetInput("X", {feed_holder_name}); + op->SetOutput("Out", {var_name}); + 
op->SetAttr("col", {static_cast(i)}); + op->CheckAttrs(); + + i++; + } + } + + // map the data of feed_targets to feed_holder + for (auto* op : global_block->AllOps()) { + if (op->Type() == kFeedOpType) { + std::string feed_target_name = op->Output("Out")[0]; + int idx = boost::get(op->GetAttr("col")); + SetFeedVariable(scope, *feed_targets[feed_target_name], feed_holder_name, + idx); + } + } + + if (!has_fetch_operators(global_block, fetch_targets, fetch_holder_name)) { + // create fetch_holder variable + auto* fetch_holder = global_block->Var(fetch_holder_name); + fetch_holder->SetType(proto::VarDesc::FETCH_LIST); + fetch_holder->SetPersistable(true); + + int i = 0; + for (auto& fetch_target : fetch_targets) { + std::string var_name = fetch_target.first; + VLOG(3) << "fetch target's name: " << var_name; + + // append fetch op + auto* op = global_block->AppendOp(); + op->SetType(kFetchOpType); + op->SetInput("X", {var_name}); + op->SetOutput("Out", {fetch_holder_name}); + op->SetAttr("col", {static_cast(i)}); + op->CheckAttrs(); + + i++; + } + } + + Run(*copy_program, scope, 0, true, true); + + // obtain the data of fetch_targets from fetch_holder + for (auto* op : global_block->AllOps()) { + if (op->Type() == kFetchOpType) { + std::string fetch_target_name = op->Input("X")[0]; + int idx = boost::get(op->GetAttr("col")); + *fetch_targets[fetch_target_name] = + GetFetchVariable(*scope, fetch_holder_name, idx); + } + } + + delete copy_program; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h new file mode 100644 index 0000000000000000000000000000000000000000..893c949939e8db2f5227b940bf721ca7f114db9c --- /dev/null +++ b/paddle/fluid/framework/executor.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace framework { + +class Executor { + public: + // TODO(dzhwinter) : Do not rely on this function, it will be removed + explicit Executor(const platform::DeviceContext& device) + : Executor(device.GetPlace()) {} + + explicit Executor(const platform::Place& place); + + /* @Brief + * Runtime evaluation of the given ProgramDesc under certain Scope + * + * @param + * ProgramDesc + * Scope + */ + void Run(const ProgramDesc&, Scope*, int, bool create_local_scope = true, + bool create_vars = true); + + void Run(const ProgramDesc& program, Scope* scope, + std::map& feed_targets, + std::map& fetch_targets, + const std::string& feed_holder_name = "feed", + const std::string& fetch_holder_name = "fetch"); + + private: + const platform::Place place_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc new file mode 100644 index 0000000000000000000000000000000000000000..a9bb17355d9bc1fb7f355a4038a4c5831d3530b1 --- /dev/null +++ b/paddle/fluid/framework/feed_fetch_method.cc @@ -0,0 +1,56 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/feed_fetch_method.h" +#include "glog/logging.h" +#include "paddle/fluid/framework/variable.h" + +namespace paddle { +namespace framework { + +void SetFeedVariable(Scope* scope, const LoDTensor& input, + const std::string& var_name, size_t index) { + // If var_name Variable is not found in GlobalScope, a new variable will + // be created. + VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index; + Variable* g_feed_value = scope->Var(var_name); + auto& feed_inputs = + *(g_feed_value->GetMutable>()); + if (index >= feed_inputs.size()) { + feed_inputs.resize(index + 1); + } + // shared data with input tensor + feed_inputs[index].ShareDataWith(input); + // set lod + feed_inputs[index].set_lod(input.lod()); +} + +LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name, + size_t index) { + // Since we want to fetch LodTensor from a variable, the variable must + // be created alreadly. 
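The feed/fetch Run overload, together with SetFeedVariable and GetFetchVariable, gives a self-contained inference path: feed operators are prepended and fetch operators appended to a copy of the program, and the plain Run above then does the work. A hedged usage sketch; the variable names "x" and "y" are placeholders, and the map value types are inferred from how the implementation dereferences the targets:

#include <map>
#include <string>

#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/place.h"

void RunWithFeedFetch(const paddle::framework::ProgramDesc& program,
                      const paddle::framework::LoDTensor& input) {
  paddle::framework::Scope scope;
  paddle::framework::Executor executor(paddle::platform::CPUPlace());

  paddle::framework::LoDTensor output;
  std::map<std::string, const paddle::framework::LoDTensor*> feed_targets{
      {"x", &input}};
  std::map<std::string, paddle::framework::LoDTensor*> fetch_targets{
      {"y", &output}};

  // Feed ops are prepended and fetch ops appended to a copy of the program;
  // the default holder variables are named "feed" and "fetch".
  executor.Run(program, &scope, feed_targets, fetch_targets);
}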
+ Variable* g_fetch_value = scope.FindVar(var_name); + PADDLE_ENFORCE(g_fetch_value->IsType(), + "Only %s can be invoked by GetFetchVariable", + typeid(FeedFetchList).name()); + auto& fetch_outputs = *g_fetch_value->GetMutable(); + auto& tensor = fetch_outputs[index]; + VLOG(3) << "Fetch " << var_name << " with index " << index + << " shape= " << tensor.dims(); + PADDLE_ENFORCE_LT(index, fetch_outputs.size()); + return tensor; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/feed_fetch_method.h b/paddle/fluid/framework/feed_fetch_method.h new file mode 100644 index 0000000000000000000000000000000000000000..5355c29047e668d1a2ec141b303dd562158b2bb3 --- /dev/null +++ b/paddle/fluid/framework/feed_fetch_method.h @@ -0,0 +1,30 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { + +void SetFeedVariable(Scope* scope, const LoDTensor& input, + const std::string& var_name, size_t index); + +LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name, + size_t index); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/feed_fetch_type.h b/paddle/fluid/framework/feed_fetch_type.h new file mode 100644 index 0000000000000000000000000000000000000000..4281e36b138f66268e2f1e835d4475676d97839d --- /dev/null +++ b/paddle/fluid/framework/feed_fetch_type.h @@ -0,0 +1,28 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include +#include "paddle/fluid/framework/lod_tensor.h" + +namespace paddle { +namespace framework { +using FeedFetchType = LoDTensor; +using FeedFetchList = std::vector; + +static const std::string kFeedOpType = "feed"; +static const std::string kFetchOpType = "fetch"; +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/framework.proto b/paddle/fluid/framework/framework.proto similarity index 100% rename from paddle/framework/framework.proto rename to paddle/fluid/framework/framework.proto diff --git a/paddle/fluid/framework/grad_op_desc_maker.h b/paddle/fluid/framework/grad_op_desc_maker.h new file mode 100644 index 0000000000000000000000000000000000000000..21dd4e885485f88eeeb034ca45c643f9fadf3163 --- /dev/null +++ b/paddle/fluid/framework/grad_op_desc_maker.h @@ -0,0 +1,195 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { + +/* + This functor class is responsible for creating the gradient ops for the given + operator fwd_op. After it is called (through operator()), the pairs of + (gradient variable, corresponding input variable of fwd_op) will be added to + grad_to_var. If an input variable of fwd_op is contained in no_grad_set, its + gradient varialbe will be ignored or kEmptyVarName depending on the template + argument DropEmptyIG in the derived classes. + */ +class GradOpDescMakerBase { + public: + explicit GradOpDescMakerBase( + const OpDesc& fwd_op, const std::unordered_set& no_grad_set, + std::unordered_map* grad_to_var, + const std::vector& grad_block = std::vector()) + : fwd_op_(fwd_op), + no_grad_set_(no_grad_set), + grad_to_var_(grad_to_var), + grad_block_(grad_block) {} + + virtual ~GradOpDescMakerBase() = default; + virtual std::vector> operator()() const = 0; + + protected: + std::vector InputGrad(const std::string& name, + bool drop_empty_grad = true) const { + std::vector ret_val; + auto var_names = this->Input(name); + ret_val.reserve(var_names.size()); + std::transform(var_names.begin(), var_names.end(), + std::back_inserter(ret_val), + [this](const std::string& fwd_var_name) -> std::string { + auto g_name = GradVarName(fwd_var_name); + if (no_grad_set_.count(g_name)) { + return kEmptyVarName; + } else { + (*this->grad_to_var_)[g_name] = fwd_var_name; + return g_name; + } + }); + if (!drop_empty_grad) { + return ret_val; + } + PADDLE_ENFORCE_LE(var_names.size(), 1UL, + "BUG from operator developer:" + " for input argument with a list of variables, " + " drop_empty_grad is not allowed because it makes" + " the correspondence bewteen a variable and its gradient" + " ambiguous. Use REGISTER_OP_EX to register the op" + " or call InputGrad(?,false) in GradOpDescMaker." 
+ " Op type %s", + fwd_op_.Type()); + + std::vector dropped_ret_val; + dropped_ret_val.reserve(ret_val.size()); + std::copy_if(ret_val.begin(), ret_val.end(), + std::back_inserter(dropped_ret_val), + [](const std::string& str) { return str != kEmptyVarName; }); + return dropped_ret_val; + } + + std::vector OutputGrad(const std::string& name) const { + std::vector ret_val; + auto onames = this->Output(name); + ret_val.reserve(onames.size()); + std::transform(onames.begin(), onames.end(), std::back_inserter(ret_val), + [this](const std::string& fwd_var_name) -> std::string { + auto g_name = GradVarName(fwd_var_name); + (*this->grad_to_var_)[g_name] = fwd_var_name; + return g_name; + }); + return ret_val; + } + + std::vector InputNames() const { + return this->fwd_op_.InputNames(); + } + + std::vector OutputNames() const { + return this->fwd_op_.OutputNames(); + } + + std::vector Input(const std::string& name) const { + return fwd_op_.Input(name); + } + + std::vector Output(const std::string& name) const { + return fwd_op_.Output(name); + } + + const std::unordered_map& Attrs() const { + return fwd_op_.GetAttrMap(); + } + + const Attribute& GetAttr(const std::string& name) const { + auto& map = fwd_op_.GetAttrMap(); + auto it = map.find(name); + PADDLE_ENFORCE(it != map.end(), "Cannot find attribute %s", name); + return it->second; + } + + template + inline const T& Attr(const std::string& name) const { + return boost::get(GetAttr(name)); + } + + std::string ForwardOpType() const { return this->fwd_op_.Type(); } + + private: + const OpDesc& fwd_op_; + const std::unordered_set& no_grad_set_; + std::unordered_map* grad_to_var_; + + protected: + std::vector grad_block_; +}; + +class SingleGradOpDescMaker : public GradOpDescMakerBase { + public: + using GradOpDescMakerBase::GradOpDescMakerBase; + + std::vector> operator()() const { + std::vector> retv; + retv.emplace_back(this->Apply()); + return retv; + } + + protected: + virtual std::unique_ptr Apply() const = 0; +}; + +template +class DefaultGradOpDescMaker : public SingleGradOpDescMaker { + public: + using SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + virtual std::unique_ptr Apply() const { + auto* grad = new OpDesc(); + grad->SetType(this->GradOpType()); + + for (auto& input_param : this->InputNames()) { + grad->SetInput(input_param, this->Input(input_param)); + grad->SetOutput(GradVarName(input_param), + this->InputGrad(input_param, DropEmptyIG)); + } + + for (auto& output_param : this->OutputNames()) { + grad->SetInput(output_param, this->Output(output_param)); + grad->SetInput(GradVarName(output_param), this->OutputGrad(output_param)); + } + + grad->SetAttrMap(this->Attrs()); + + return std::unique_ptr(grad); + } + + virtual std::string GradOpType() const { + return this->ForwardOpType() + "_grad"; + } +}; + +class EmptyGradOpMaker : public GradOpDescMakerBase { + public: + using GradOpDescMakerBase::GradOpDescMakerBase; + std::vector> operator()() const override { + return {}; + } +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/init.cc b/paddle/fluid/framework/init.cc new file mode 100644 index 0000000000000000000000000000000000000000..cb2d740d8609210064a06cccc5d45c84275e9709 --- /dev/null +++ b/paddle/fluid/framework/init.cc @@ -0,0 +1,78 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include // for strdup +#include +#include +#include + +#include "paddle/fluid/framework/init.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/string/piece.h" + +namespace paddle { +namespace framework { + +std::once_flag gflags_init_flag; + +void InitGflags(std::vector &argv) { + std::call_once(gflags_init_flag, [&]() { + int argc = argv.size(); + char **arr = new char *[argv.size()]; + std::string line; + for (size_t i = 0; i < argv.size(); i++) { + arr[i] = &argv[i][0]; + line += argv[i]; + line += ' '; + } + google::ParseCommandLineFlags(&argc, &arr, true); + VLOG(1) << "Init commandline: " << line; + }); +} + +void InitDevices() { + /*Init all avaiable devices by default */ + + std::vector places; + places.emplace_back(platform::CPUPlace()); + int count = 0; + +#ifdef PADDLE_WITH_CUDA + try { + count = platform::GetCUDADeviceCount(); + } catch (const std::exception &exp) { + LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime."; + } +#else + LOG(WARNING) + << "'CUDA' is not supported, Please re-compile with WITH_GPU option"; +#endif + + for (int i = 0; i < count; ++i) { + places.emplace_back(platform::CUDAPlace(i)); + } + + platform::DeviceContextPool::Init(places); +} + +void InitGLOG(const std::string &prog_name) { + // glog will not hold the ARGV[0] inside. + // Use strdup to alloc a new string. + google::InitGoogleLogging(strdup(prog_name.c_str())); + google::InstallFailureSignalHandler(); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/init.h b/paddle/fluid/framework/init.h similarity index 100% rename from paddle/framework/init.h rename to paddle/fluid/framework/init.h diff --git a/paddle/fluid/framework/init_test.cc b/paddle/fluid/framework/init_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f3018541e27a403e1b6e63a8da9eeb1f67915e9e --- /dev/null +++ b/paddle/fluid/framework/init_test.cc @@ -0,0 +1,40 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/init.h" +#include "paddle/fluid/platform/device_context.h" + +TEST(InitDevices, CPU) { + using paddle::framework::InitDevices; + using paddle::platform::DeviceContextPool; + +#ifndef PADDLE_WITH_CUDA + InitDevices(); + DeviceContextPool& pool = DeviceContextPool::Instance(); + ASSERT_EQ(pool.size(), 1U); +#endif +} + +TEST(InitDevices, CUDA) { + using paddle::framework::InitDevices; + using paddle::platform::DeviceContextPool; + +#ifdef PADDLE_WITH_CUDA + int count = paddle::platform::GetCUDADeviceCount(); + InitDevices(); + DeviceContextPool& pool = DeviceContextPool::Instance(); + ASSERT_EQ(pool.size(), 1U + static_cast(count)); +#endif +} diff --git a/paddle/framework/library_type.h b/paddle/fluid/framework/library_type.h similarity index 100% rename from paddle/framework/library_type.h rename to paddle/fluid/framework/library_type.h diff --git a/paddle/fluid/framework/lod_rank_table.cc b/paddle/fluid/framework/lod_rank_table.cc new file mode 100644 index 0000000000000000000000000000000000000000..31c87492349bff4cd81b101a0e8d44b0516bac46 --- /dev/null +++ b/paddle/fluid/framework/lod_rank_table.cc @@ -0,0 +1,58 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/lod_rank_table.h" + +namespace paddle { +namespace framework { +void LoDRankTable::Reset(const LoD& lod, size_t level) { + this->coarse_lod_.clear(); + this->items_.clear(); + PADDLE_ENFORCE(level < lod.size(), + "Cannot rank lod since the level %d is less than lod size %d", + level, lod.size()); + coarse_lod_.reserve(level); + for (size_t i = 0; i < level; ++i) { + coarse_lod_.push_back(lod[i]); + } + auto& vec = lod[level]; + for (size_t i = 0; i < vec.size() - 1; ++i) { + TableItem item; + item.index = i; + item.length = vec[i + 1] - vec[i]; + VLOG(10) << "Add item to rank table " << item.index << " " << item.length; + items_.emplace_back(item); + } + // NOTE(yuyang18): + // + // The time complexity of stable_sort is O(N*log(N)) if additional memory is + // available. It is easy to debug and unit test when using `stable_sort` + // instead of `sort`. Also, the items of a rank table will not be too large. 
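As a concrete illustration of what Reset builds: for lod[level] = {0, 2, 5, 7, 10} the items have lengths {2, 3, 2, 3}, and the descending stable sort applied next orders their indices as 1, 3, 0, 2, with ties keeping their original order. A standalone sketch of that ordering, using a local Item struct rather than the TableItem from the header:

#include <algorithm>
#include <cassert>
#include <vector>

struct Item {
  size_t index;
  size_t length;
};

int main() {
  // Offsets {0, 2, 5, 7, 10} describe four sequences of lengths 2, 3, 2, 3.
  std::vector<size_t> offsets{0, 2, 5, 7, 10};
  std::vector<Item> items;
  for (size_t i = 0; i + 1 < offsets.size(); ++i) {
    items.push_back({i, offsets[i + 1] - offsets[i]});
  }

  // Same ordering rule as LoDRankTable::Reset: longest first, stable on ties.
  std::stable_sort(
      items.begin(), items.end(),
      [](const Item& a, const Item& b) { return a.length > b.length; });

  assert(items[0].index == 1 && items[1].index == 3);
  assert(items[2].index == 0 && items[3].index == 2);
  return 0;
}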
+ std::stable_sort(items_.begin(), items_.end(), + [](const TableItem& a, const TableItem& b) { + return a.length > b.length; + }); +} + +} // namespace framework + +std::ostream& operator<<(std::ostream& out, + const framework::LoDRankTable& table) { + out << "NumOfSequence " << table.items().size() << "\n"; + for (auto& each_item : table.items()) { + out << "\tSeq #" << each_item.index << ", Len=" << each_item.length << "\n"; + } + return out; +} +} // namespace paddle diff --git a/paddle/fluid/framework/lod_rank_table.h b/paddle/fluid/framework/lod_rank_table.h new file mode 100644 index 0000000000000000000000000000000000000000..0eaaf49e4c4d90250b247edf2a8699b8c7c5920d --- /dev/null +++ b/paddle/fluid/framework/lod_rank_table.h @@ -0,0 +1,60 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/framework/lod_tensor.h" + +namespace paddle { +namespace framework { + +// LoD Rank Table stores the `level` of `lod` which is ordered by sequence +// length in descending order. It is useful when implement dynamic RNN and is +// shared by dynamic RNN memory, dynamic RNN slice input and dynamic RNN slice +// output operators. +// +// The table item contains two element. The length of sequence and the index of +// sequence in that level. +// +// LoDRankTable also stores the coarse_lod, which is the lod information whose +// level is less than input level, in order to restore the output LoD +// information. +class LoDRankTable { + public: + struct TableItem { + size_t index; + size_t length; + }; + + LoDRankTable() {} + + void Reset(const LoD& lod, size_t level); + + const std::vector& items() const { return this->items_; } + + const LoD& coarse_lod() const { return this->coarse_lod_; } + + size_t level() const { return coarse_lod_.size(); } + + private: + LoD coarse_lod_; + std::vector items_; +}; + +} // namespace framework + +std::ostream& operator<<(std::ostream& out, + const framework::LoDRankTable& table); + +} // namespace paddle diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc new file mode 100644 index 0000000000000000000000000000000000000000..05c67e453d0b8c84aaa8b72ec314153791c73e8f --- /dev/null +++ b/paddle/fluid/framework/lod_tensor.cc @@ -0,0 +1,378 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/framework.pb.h" + +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/memory/memory.h" + +#include +#include +#include +#include + +namespace paddle { +namespace framework { + +std::ostream &operator<<(std::ostream &os, const LoD &lod) { + os << "{"; + for (auto &v : lod) { + os << "{"; + for (auto &i : v) { + os << i << ","; + } + os << "}"; + } + os << "}"; + + return os; +} + +std::ostream &operator<<(std::ostream &os, const LoDTensor &t) { + PADDLE_ENFORCE(t.type().hash_code() == typeid(float).hash_code()); + + if (!platform::is_cpu_place(t.place())) { + LoDTensor tt; + framework::Copy(t, platform::CPUPlace(), &tt); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(t.place()); + dev_ctx.Wait(); + + os << tt; + return os; + } + + os << "dim: " << t.dims() << "\n"; + os << "lod: " << t.lod() << "\n"; + + // only print first ten elements + int64_t size = t.numel() < 10 ? t.numel() : 10; + for (int64_t i = 0; i < size; ++i) { + os << t.data()[i] << " "; + } + + return os; +} + +std::string LoDToString(const LoD &lod) { + std::ostringstream stream; + stream << lod; + return stream.str(); +} + +LoD SliceInLevel(const LoD &in, size_t level, size_t elem_begin, + size_t elem_end) { + PADDLE_ENFORCE_LT(level, in.size()); + PADDLE_ENFORCE_LT(elem_end, in[level].size()); + + LoD res; + res.resize(in.size() - level); + // copy the first level + res[0].assign(in[level].begin() + elem_begin, + in[level].begin() + elem_end + 1); + for (size_t lvl = 1; lvl < res.size(); lvl++) { + const auto &in_level = in[level + lvl]; + const auto &above_level = res[lvl - 1]; + auto &out_level = res[lvl]; + out_level.assign(in_level.begin() + above_level.front(), + in_level.begin() + above_level.back() + 1); + } + for (size_t lvl = 0; lvl < res.size(); lvl++) { + // to make the first offset equals 0, all the elements minus the first + // element + size_t front = res[lvl].front(); + for (auto &ele : res[lvl]) { + ele -= front; + } + } + return res; +} + +LoD ToAbsOffset(const LoD &in) { + // the lowest level stores relative offsets + if (in.empty() || in.size() == 1) return in; + LoD result = in; + for (auto level = static_cast(in.size() - 2); level >= 0; level--) { + for (size_t i = 0; i < in[level].size(); ++i) { + size_t index = in[level][i]; + result[level][i] = result[level + 1][index]; + } + } + return result; +} + +bool operator==(const LoD &a, const LoD &b) { + if (a.size() != b.size()) { + return false; + } + + for (size_t i = 0; i < a.size(); i++) { + const auto &a_level = a[i]; + const auto &b_level = b[i]; + if (a_level.size() != b_level.size()) { + return false; + } + for (size_t j = 0; j < a_level.size(); j++) { + if (a_level[j] != b_level[j]) { + return false; + } + } + } + return true; +} + +bool CheckLoD(const LoD &in, int tensor_height) { + if (in.empty()) return true; + for (const auto &level : in) { + // check: there should be more than 2 offsets existing in each level. + if (level.size() < 2) return false; + // check: the first offset(the begin offset) of each level should be 0. + if (level.front() != 0) return false; + // check: all the offsets in a level should be ascending(no same items + // allows). 
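To see ToAbsOffset concretely: each level's entries index into the level below, so walking the levels bottom-up and replacing every relative offset with the absolute offset it points to converts the whole LoD. A standalone sketch over plain std::vector, reproducing the values asserted by the ToAbsOffset unit test further down:

#include <cassert>
#include <vector>

using Lod = std::vector<std::vector<size_t>>;

// Same idea as ToAbsOffset: bottom-up, replace each relative offset with the
// absolute offset it refers to in the level below.
Lod ToAbs(Lod in) {
  if (in.size() <= 1) return in;
  for (int level = static_cast<int>(in.size()) - 2; level >= 0; --level) {
    for (size_t i = 0; i < in[level].size(); ++i) {
      in[level][i] = in[level + 1][in[level][i]];
    }
  }
  return in;
}

int main() {
  Lod relative{{0, 2}, {0, 1, 3}, {0, 2, 4, 5}};
  Lod expected{{0, 5}, {0, 2, 5}, {0, 2, 4, 5}};
  assert(ToAbs(relative) == expected);
  return 0;
}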
+ if (!std::is_sorted(level.begin(), level.begin(), [](size_t a, size_t b) { + if (a < b) return true; + return false; + })) { + LOG(INFO) << "ascending error"; + return false; + } + } + // check: the lowest level's last offset should equals `tensor_height` if + // tensor_height>0. + if (tensor_height > 0 && (size_t)tensor_height != in.back().back()) + return false; + + // check: the higher level's last offset should equals the lower level's + // size-1. + // NOTE LoD store the levels from top to bottom, so the higher level goes + // first. + for (size_t level = 0; level < in.size() - 1; level++) { + if (in[level].back() != in[level + 1].size() - 1) return false; + } + return true; +} + +bool CheckAbsLoD(const LoD &in, int tensor_height) { + if (in.empty()) return true; + for (const auto &level : in) { + // check: all the offsets in a level should be ascending(no same items + // allows). + if (!std::is_sorted(level.begin(), level.begin(), [](size_t a, size_t b) { + if (a < b) return true; + return false; + })) { + return false; + } + + // check: there should be more than 2 offsets existing in each level. + if (level.size() < 2) return false; + + // check: the first offset of each level should be 0, and the last should be + // the same(the height of underlying tensor). + if (level.front() != 0) return false; + if (tensor_height < 0) { + tensor_height = level.back(); + } else if ((size_t)tensor_height != level.back()) { + return false; + } + } + return true; +} + +using LoDAndOffset = std::pair>; +LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD &lod, size_t start_idx, + size_t end_idx, size_t start_level) { + LoD sub_lod; + + for (size_t level_idx = start_level; level_idx < lod.size(); ++level_idx) { + PADDLE_ENFORCE_LE(start_idx, end_idx); + PADDLE_ENFORCE_LT(end_idx, lod[level_idx].size()); + std::vector level_lens; + for (size_t i = start_idx; i < end_idx; ++i) { + level_lens.push_back(lod[level_idx][i + 1] - lod[level_idx][i]); + } + sub_lod.emplace_back(level_lens); + start_idx = lod[level_idx][start_idx]; + end_idx = lod[level_idx][end_idx]; + } + + return LoDAndOffset{sub_lod, {start_idx, end_idx}}; +} + +void AppendLoD(LoD *lod, const LoD &lod_length) { + PADDLE_ENFORCE( + lod->empty() || lod->size() == lod_length.size(), + "The lod_length should has the same size with the appended lod."); + if (lod->empty()) { + for (size_t i = 0; i < lod_length.size(); ++i) { + lod->emplace_back(1, 0); // size = 1, value = 0; + } + *lod = LoD(lod_length.size(), std::vector({0})); + } + for (size_t i = 0; i < lod->size(); ++i) { + auto &level = (*lod)[i]; + for (size_t len : lod_length[i]) { + level.push_back(level.back() + len); + } + } +} + +void SerializeToStream(std::ostream &os, const LoDTensor &tensor, + const platform::DeviceContext &dev_ctx) { + { // the 1st field, uint32_t version for LoDTensor + constexpr uint32_t version = 0; + os.write(reinterpret_cast(&version), sizeof(version)); + } + { + // the 2st field, LoD information + // uint64_t lod_level + // uint64_t lod_level_1 size in byte. + // int* lod_level_1 data + // ... 
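For a tensor whose LoD is {{0, 2, 5}}, the header written here is a 4-byte version (0), an 8-byte level count (1), an 8-byte size-in-bytes (3 * sizeof(size_t)), and then the raw offsets, followed by the serialized Tensor. A standalone sketch of just that LoD header, mirroring the writes above and assuming an 8-byte size_t:

#include <cassert>
#include <cstdint>
#include <sstream>
#include <vector>

int main() {
  std::vector<size_t> level{0, 2, 5};
  std::ostringstream os;

  uint32_t version = 0;  // 1st field: LoDTensor version
  os.write(reinterpret_cast<const char*>(&version), sizeof(version));

  uint64_t lod_levels = 1;  // 2nd field: number of LoD levels
  os.write(reinterpret_cast<const char*>(&lod_levels), sizeof(lod_levels));

  uint64_t bytes = level.size() * sizeof(size_t);  // per level: size, then data
  os.write(reinterpret_cast<const char*>(&bytes), sizeof(bytes));
  os.write(reinterpret_cast<const char*>(level.data()),
           static_cast<std::streamsize>(bytes));

  // 4 (version) + 8 (level count) + 8 (byte size) + 24 (offsets) = 44 bytes;
  // the serialized Tensor itself follows this header.
  assert(os.str().size() == 44);
  return 0;
}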
+ auto lod = tensor.lod(); + uint64_t size = lod.size(); + os.write(reinterpret_cast(&size), sizeof(size)); + + for (auto &each : lod) { + size = each.size() * sizeof(framework::LoD::value_type::value_type); + os.write(reinterpret_cast(&size), sizeof(size)); + os.write(reinterpret_cast(each.data()), + static_cast(size)); + } + } + // the 3st field, Tensor + SerializeToStream(os, static_cast(tensor), dev_ctx); +} + +void DeserializeFromStream(std::istream &is, LoDTensor *tensor, + const platform::DeviceContext &dev_ctx) { + { + // the 1st field, unit32_t version for LoDTensor + uint32_t version; + is.read(reinterpret_cast(&version), sizeof(version)); + PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported"); + } + { + // the 2st field, LoD information + uint64_t lod_level; + is.read(reinterpret_cast(&lod_level), sizeof(lod_level)); + auto &lod = *tensor->mutable_lod(); + lod.resize(lod_level); + for (uint64_t i = 0; i < lod_level; ++i) { + uint64_t size; + is.read(reinterpret_cast(&size), sizeof(size)); + std::vector tmp(size / sizeof(size_t)); + is.read(reinterpret_cast(tmp.data()), + static_cast(size)); + lod[i] = tmp; + } + } + // the 3st filed, Tensor + DeserializeFromStream(is, static_cast(tensor), dev_ctx); +} + +std::vector LoDTensor::SplitLoDTensor( + const std::vector places) const { + check_memory_size(); + int batch_size = + lod().empty() ? dims()[0] : static_cast(lod()[0].size()) - 1; + size_t result_size = std::min(static_cast(batch_size), places.size()); + size_t remainder = batch_size % places.size(); + + std::vector results; + results.reserve(result_size); + + int step_width = static_cast(batch_size / result_size); + for (size_t i = 0; i < result_size; ++i) { + int begin = static_cast(i * step_width); + int end = static_cast((i + 1) * step_width); + if (i + 1 == places.size()) { // last + end += remainder; + } + + LoDTensor dst; + if (lod().empty()) { + auto src = Slice(begin, end); + auto &dst_place = places[i]; + framework::Copy(src, dst_place, &dst); + } else { + auto lod_and_offset = GetSubLoDAndAbsoluteOffset(lod(), begin, end, 0); + + auto &offset = lod_and_offset.second; + auto src = Slice(offset.first, offset.second); + auto &dst_place = places[i]; + framework::Copy(src, dst_place, &dst); + + LoD my_lod; + for (auto &l : lod_and_offset.first) { + std::vector v{0}; + for (auto &ll : l) { + v.push_back(ll + v.back()); + } + my_lod.emplace_back(v); + } + dst.set_lod(my_lod); + } + results.emplace_back(dst); + } + + return results; +} + +void LoDTensor::MergeLoDTensor( + const std::vector &lod_tensors, + platform::Place dst_place) { + PADDLE_ENFORCE(!lod_tensors.empty()); + + framework::DDim new_dim = lod_tensors[0]->dims(); + std::type_index new_type = lod_tensors[0]->type(); + framework::DataLayout new_layout = lod_tensors[0]->layout(); + LoD new_lod = lod_tensors[0]->lod(); + for (size_t i = 1; i < lod_tensors.size(); ++i) { + auto *t = lod_tensors[i]; + PADDLE_ENFORCE_EQ(new_type.hash_code(), t->type().hash_code()); + PADDLE_ENFORCE_EQ(new_layout, t->layout()); + + PADDLE_ENFORCE_EQ(framework::product(new_dim) / new_dim[0], + framework::product(t->dims()) / t->dims()[0]); + new_dim[0] += t->dims()[0]; + + auto &lod = t->lod(); + for (size_t j = 0; j < lod.size(); ++j) { + auto &sub_lod = new_lod[j]; + auto &offset = sub_lod.back(); + for (size_t k = 1; k < lod[j].size(); ++k) { + sub_lod.push_back(lod[j][k] + offset); + } + } + } + Resize(new_dim); + set_layout(new_layout); + set_lod(new_lod); + mutable_data(dst_place, new_type); + + int begin = 0; + for (auto 
*src : lod_tensors) { + int end = begin + src->dims()[0]; + auto dst = Slice(begin, end); + framework::Copy(*src, dst_place, &dst); + begin = end; + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h new file mode 100644 index 0000000000000000000000000000000000000000..1509a9fb1347659f7526c6892f632feb8c84579c --- /dev/null +++ b/paddle/fluid/framework/lod_tensor.h @@ -0,0 +1,213 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#ifdef PADDLE_WITH_CUDA +#include +#include +#endif + +#include +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/mixed_vector.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace framework { + +/* + * LoD is short for Level of Details. + * + * - in a level, each element indicates relative offset of the lower level + * - the first element should be 0 and that indicates that this sequence start + * from 0 + * - each sequence's begin and end(no-inclusive) is level[id, id+1] + * + * For example: + * 3-level LoD stores + * + * 0 2 3 + * 0 2 4 7 + * 0 2 5 7 10 12 15 20 + */ +using LoD = std::vector>; + +std::ostream& operator<<(std::ostream& os, const LoD& lod); +std::ostream& operator<<(std::ostream& os, const LoDTensor& t); + +std::string LoDToString(const LoD& lod); + +LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin, + size_t elem_end); +/* + * Transform an LoD from relative offsets to absolute offsets. + */ +LoD ToAbsOffset(const LoD& in); + +bool operator==(const LoD& a, const LoD& b); + +/* + * Check whether this lod's format is valid. + * + * ATTENTION: + * - Empty lod is treated as valid. + * + * It will check two things: + * + * 1. all the offsets in a level should be ascending(no same items allows). + * 2. there should be more than 2 offsets existing in each level. + * 3. the higher level's last offset should equals the lower level's size-1. + * 4. the first offset(the begin offset) of each level should be 0. + * 5. the lowest level's last offset should equals `tensor_height` if + * tensor_height>0. + */ + +bool CheckLoD(const LoD& in, int tensor_height = -1); +/* + * Check whether this absolute lod's format is valid. + * + * ATTENTION: + * - Empty lod is treated as valid. + * + * It will check two things: + * 1. all the offsets in a level should be ascending(no same items allows) + * 2. there should be more than 2 offsets existing in each level. + * 3. the first offset of each level should be 0, and the last should be the + * same(the height of underlying tensor) or `tensor_height` if + * tensor_height>0. + */ +bool CheckAbsLoD(const LoD& in, int tensor_height = -1); + +/* + * LoDTensor (Level of details Tensor) + * see https://en.wikipedia.org/wiki/Level_of_details for reference. 
+ */ +class LoDTensor : public Tensor { + public: + LoDTensor() : Tensor() {} + + /* Constructor with place should only be used in pybind */ + explicit LoDTensor(const platform::Place& place) : Tensor(place) {} + + explicit LoDTensor(const LoD& lod) : lod_(lod) {} + + void set_lod(const LoD& lod) { lod_ = lod; } + + const LoD& lod() const { return lod_; } + + LoD* mutable_lod() { return &lod_; } + + /* + * Get the start offset and end offset of an element from LoD. + */ + std::pair lod_element(size_t level, size_t elem) const { + PADDLE_ENFORCE_LT(level, NumLevels()); + PADDLE_ENFORCE_LT(elem, NumElements(level)); + return std::make_pair((lod_)[level][elem], (lod_)[level][elem + 1]); + } + + /* + * Number of LoDTensor's levels, each level has units of data, for example, + * in the sentence's view, article, paragraph, sentence are 3 levels. + */ + size_t NumLevels() const { return lod_.size(); } + /* + * Number of elements in a level. + */ + size_t NumElements(size_t level = 0) const { + PADDLE_ENFORCE_LT(level, NumLevels()); + // the last offset is the end of last element + return (lod_)[level].size() - 1; + } + + std::vector SplitLoDTensor( + const std::vector places) const; + + void MergeLoDTensor(const std::vector& lod_tensors, + platform::Place place); + + private: + LoD lod_; +}; + +/* + * Expand the `source` to fit the LoD of `lod`. For example, a `source` + * LoDTensor is + * - LoD: [0, 2] + * - tensor: [a0, a1] + * a `lod` is + * - LoD: [0 3 5] + * returns a new LoDTensor + * - [a0 a0 a0 a1 a1] + */ +template +LoDTensor LodExpand(const LoDTensor& source, const LoD& lod, size_t level, + const platform::Place& place) { + LoD abs_lod = ToAbsOffset(lod); + const auto& lod_level = lod[level]; + size_t num_instances = source.dims()[0]; + + // new tensor + LoDTensor tensor; + tensor.set_lod(lod); + auto dims = source.dims(); + dims[0] = lod_level.back(); + tensor.Resize(dims); + tensor.mutable_data(place); + + PADDLE_ENFORCE_EQ(num_instances, lod_level.size() - 1); + for (size_t ins = 0; ins < num_instances; ins++) { + for (size_t elem = lod_level[ins]; elem < lod_level[ins + 1]; elem++) { + auto slice = tensor.Slice(elem, elem + 1); + Copy(source.Slice(ins, ins + 1), platform::CPUPlace(), + platform::CPUDeviceContext(), &slice); + } + } + return tensor; +} + +// Get the absolute offset of a lod[start_level][start_idx:end_idx] and +// relative length of details for every levels(i.e., [start_level: ]). +// +// For example, +// lod = [[0, 3, 4, 8], [0, 9, 10, 11, 13, 17, 19, 22, 24]] +// start_level = 0 +// start_idx = 1 +// end_idx = 3 +// +// Returns: +// LoD = [[1, 4], [2, 4, 2, 3, 2]] +// pair = {11, 24} +std::pair> GetSubLoDAndAbsoluteOffset( + const LoD& lod, size_t start_idx, size_t end_idx, size_t start_level); + +void AppendLoD(LoD* lod, const LoD& lod_length); + +/* + * Serialize/Desiralize LoDTensor to std::ostream + * You can pass ofstream or ostringstream to serilize to file + * or to a in memory string. GPU tensor will be copied to CPU. 
+ */ +void SerializeToStream(std::ostream& os, const LoDTensor& tensor, + const platform::DeviceContext& dev_ctx); +void DeserializeFromStream(std::istream& is, LoDTensor* tensor, + const platform::DeviceContext& dev_ctx); + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/lod_tensor.md b/paddle/fluid/framework/lod_tensor.md similarity index 100% rename from paddle/framework/lod_tensor.md rename to paddle/fluid/framework/lod_tensor.md diff --git a/paddle/fluid/framework/lod_tensor_array.h b/paddle/fluid/framework/lod_tensor_array.h new file mode 100644 index 0000000000000000000000000000000000000000..652513bd22597000e8249eb19776182d850793aa --- /dev/null +++ b/paddle/fluid/framework/lod_tensor_array.h @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/framework/lod_tensor.h" + +namespace paddle { +namespace framework { +using LoDTensorArray = std::vector; +} +} // namespace paddle diff --git a/paddle/fluid/framework/lod_tensor_test.cc b/paddle/fluid/framework/lod_tensor_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..7e0ed2495d68a8cc0d377aaf5b5103aea1064688 --- /dev/null +++ b/paddle/fluid/framework/lod_tensor_test.cc @@ -0,0 +1,228 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
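Before the unit tests, a compact usage sketch of the LoD conventions defined in lod_tensor.h: offsets rather than lengths, one vector per level, and lod_element giving a sequence's [begin, end) row span. The element type and the mutable_data call are illustrative only:

#include <cassert>

#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/platform/place.h"

// Two sequences with lengths 2 and 3 stored in one LoDTensor of height 5.
void LoDBasics() {
  paddle::framework::LoDTensor t;
  t.set_lod({{0, 2, 5}});
  t.Resize({5, 1});
  t.mutable_data<float>(paddle::platform::CPUPlace());

  assert(t.NumLevels() == 1);
  assert(t.NumElements(0) == 2);
  // The second sequence occupies rows [2, 5).
  auto span = t.lod_element(0, 1);
  assert(span.first == 2 && span.second == 5);
}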
+ +#include "paddle/fluid/framework/lod_tensor.h" + +#include +#include +#include +#include +#include + +namespace paddle { +namespace framework { + +TEST(LoD, data) { + LoD lod{{0, 1, 2}}; + lod.push_back({0, 2, 4, 5}); + lod.push_back(std::vector({0, 1, 6, 8, 10, 11})); + + auto& v = lod[0]; + for (size_t i = 0; i < v.size(); ++i) { + EXPECT_EQ(v[i], i); + } +} + +TEST(LodExpand, test) { + LoD lod{{0, 2}}; + LoDTensor tensor; + tensor.set_lod(lod); + tensor.Resize({2, 1}); + tensor.mutable_data(platform::CPUPlace()); + tensor.data()[0] = 0; + tensor.data()[1] = 1; + + LoD target; + target.emplace_back(std::vector{0, 3, 5}); + auto new_tensor = LodExpand(tensor, target, 0UL, platform::CPUPlace()); + std::vector result{{0, 0, 0, 1, 1}}; + for (size_t i = 0; i < 5; i++) { + ASSERT_EQ(new_tensor.data()[i], result[i]); + } +} + +TEST(LoD, GetFineGrainedLoDLength) { + LoD lod; + lod.push_back(std::vector({0, 2, 4, 5})); + lod.push_back(std::vector({0, 1, 6, 8, 10, 11})); + lod.push_back( + std::vector({0, 2, 5, 7, 10, 12, 15, 17, 20, 24, 26, 29})); + + auto lod_and_offset = + paddle::framework::GetSubLoDAndAbsoluteOffset(lod, 1, 2, 0); + LoD lod_length = lod_and_offset.first; + size_t start_offset = lod_and_offset.second.first; + size_t end_offset = lod_and_offset.second.second; + + LoD expected; + expected.push_back(std::vector{2}); + expected.push_back(std::vector{2, 2}); + expected.push_back(std::vector{2, 3, 4, 2}); + EXPECT_EQ(lod_length, expected); + EXPECT_EQ(start_offset, 15UL); + EXPECT_EQ(end_offset, 26UL); +} + +TEST(LoD, AppendLoD) { + LoD lod_lens; + lod_lens.push_back(std::vector({2})); + lod_lens.push_back(std::vector({2, 2})); + lod_lens.push_back(std::vector({2, 3, 4, 2})); + + LoD origin; + origin.push_back(std::vector({0, 2})); + origin.push_back(std::vector({0, 1, 6})); + origin.push_back(std::vector({0, 2, 5, 7, 10, 12, 15})); + + paddle::framework::AppendLoD(&origin, lod_lens); + + LoD expected; + expected.push_back(std::vector({0, 2, 4})); + expected.push_back(std::vector({0, 1, 6, 8, 10})); + expected.push_back( + std::vector({0, 2, 5, 7, 10, 12, 15, 17, 20, 24, 26})); + EXPECT_EQ(origin, expected); +} + +TEST(LoD, ToAbsOffset) { + LoD relative_lod; + relative_lod.push_back(std::vector({0, 2})); + relative_lod.push_back(std::vector({0, 1, 3})); + relative_lod.push_back(std::vector({0, 2, 4, 5})); + + LoD abs_lod = paddle::framework::ToAbsOffset(relative_lod); + + LoD expected; + expected.push_back(std::vector({0, 5})); + expected.push_back(std::vector({0, 2, 5})); + expected.push_back(std::vector({0, 2, 4, 5})); + + EXPECT_EQ(abs_lod, expected); +} + +TEST(LoD, SplitLoDTensor) { + LoD lod; + lod.push_back(std::vector({0, 2, 4, 5, 6})); + lod.push_back(std::vector({0, 1, 6, 8, 13, 15, 20})); + + platform::CPUPlace place; + LoDTensor lod_tensor; + lod_tensor.Resize({20, 1}); + float* dst_ptr = lod_tensor.mutable_data(place); + for (int i = 0; i < lod_tensor.numel(); ++i) { + dst_ptr[i] = i; + } + lod_tensor.set_lod(lod); + + std::vector places{platform::CPUPlace(), + platform::CPUPlace()}; + LoD lod0; + lod0.push_back(std::vector({0, 2, 4})); + lod0.push_back(std::vector({0, 1, 6, 8, 13})); + LoD lod1; + lod1.push_back(std::vector({0, 1, 2})); + lod1.push_back(std::vector({0, 2, 7})); + + auto lods = lod_tensor.SplitLoDTensor(places); + EXPECT_EQ(lods[0].lod(), lod0); + EXPECT_EQ(lods[1].lod(), lod1); +} + +TEST(LoD, MergeLoDTensor) { + LoD lod; + lod.push_back(std::vector({0, 2, 4, 5, 6})); + lod.push_back(std::vector({0, 1, 6, 8, 13, 15, 20})); + + platform::CPUPlace 
place; + + LoDTensor lod_tensor0; + LoD lod0; + lod0.push_back(std::vector({0, 2, 4})); + lod0.push_back(std::vector({0, 1, 6, 8, 13})); + lod_tensor0.set_lod(lod0); + + lod_tensor0.Resize({13, 1}); + float* dst_ptr = lod_tensor0.mutable_data(place); + for (int i = 0; i < lod_tensor0.numel(); ++i) { + dst_ptr[i] = i; + } + + LoDTensor lod_tensor1; + LoD lod1; + lod1.push_back(std::vector({0, 1, 2})); + lod1.push_back(std::vector({0, 2, 7})); + lod_tensor1.set_lod(lod1); + lod_tensor1.Resize({7, 1}); + dst_ptr = lod_tensor1.mutable_data(place); + for (int i = 0; i < lod_tensor1.numel(); ++i) { + dst_ptr[i] = i; + } + + std::vector lods{&lod_tensor0, &lod_tensor1}; + + LoDTensor lod_tensor; + lod_tensor.MergeLoDTensor(lods, place); + EXPECT_EQ(lod_tensor.lod(), lod); +} + +TEST(LoD, CheckLoD) { + LoD relative_lod; + relative_lod.push_back(std::vector({0, 2})); + relative_lod.push_back(std::vector({0, 1, 3})); + relative_lod.push_back(std::vector({0, 2, 4, 5})); + + // check compatible + ASSERT_TRUE(CheckLoD(relative_lod)); + relative_lod[1].back()++; + ASSERT_FALSE(CheckLoD(relative_lod)); + relative_lod[1].back()--; // recover it + + // check empty + LoD empty_lod; + ASSERT_TRUE(CheckLoD(empty_lod)); + + // check less than 2 offsets in a level + LoD some_lod0; + some_lod0.push_back(std::vector({0})); + ASSERT_FALSE(CheckLoD(some_lod0)); + + // check with underlying tensor storage. + ASSERT_TRUE(CheckLoD(relative_lod, 5)); + ASSERT_FALSE(CheckLoD(relative_lod, 9)); +} + +TEST(LoD, CheckAbsLoD) { + LoD relative_lod; + relative_lod.push_back(std::vector({0, 2})); + relative_lod.push_back(std::vector({0, 1, 3})); + relative_lod.push_back(std::vector({0, 2, 4, 5})); + + auto abs_lod = ToAbsOffset(relative_lod); + + ASSERT_TRUE(CheckAbsLoD(abs_lod)); + + // check less than 2 offsets in a level. + + // check the last item should be compatible with tensor height. + abs_lod.back().back()++; + ASSERT_FALSE(CheckAbsLoD(abs_lod)); + abs_lod.back().back()--; // restore + + // check less than 2 offsets in a lod. + LoD abs_lod0; + abs_lod0.push_back(std::vector({0})); + ASSERT_FALSE(CheckAbsLoD(abs_lod0)); +} +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/lod_tensor_test.cu b/paddle/fluid/framework/lod_tensor_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..4dd7810c1b25cbfeb7d6d79034a97db3f1d67ebb --- /dev/null +++ b/paddle/fluid/framework/lod_tensor_test.cu @@ -0,0 +1,72 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
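The relative-to-absolute conversion checked by TEST(LoD, ToAbsOffset) above is easy to restate without the framework types. The sketch below is a self-contained model of the same bottom-up mapping, written against plain std::vector<size_t> levels; it is not the framework implementation, and all names are mine.

#include <cassert>
#include <cstddef>
#include <vector>

using Level = std::vector<size_t>;

// Map every upper level through the (already absolute) level below it;
// the deepest level already stores absolute offsets into the tensor.
std::vector<Level> ToAbsOffsetSketch(std::vector<Level> lod) {
  if (lod.size() < 2) return lod;
  for (int level = static_cast<int>(lod.size()) - 2; level >= 0; --level) {
    for (size_t& offset : lod[level]) {
      offset = lod[level + 1][offset];
    }
  }
  return lod;
}

int main() {
  std::vector<Level> relative{{0, 2}, {0, 1, 3}, {0, 2, 4, 5}};
  auto absolute = ToAbsOffsetSketch(relative);
  assert((absolute[0] == Level{0, 5}));    // same values the test expects
  assert((absolute[1] == Level{0, 2, 5}));
  assert(absolute[2] == relative[2]);      // deepest level is unchanged
  return 0;
}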
+ +#include +#include +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/init.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/platform/assert.h" +#include "paddle/fluid/platform/place.h" + +__global__ void test(size_t* a, int size) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; + i += blockDim.x * gridDim.x) { + a[i] *= 2; + } +} + +TEST(LoD, data) { + paddle::framework::InitDevices(); + + paddle::framework::LoD lod{{0, 1, 2}}; + lod.push_back({0, 2, 4, 5}); + lod.push_back(std::vector({0, 1, 6, 8, 10, 11})); + + auto& v = lod[0]; + paddle::platform::CUDAPlace gpu(0); + test<<<1, 1>>>(v.CUDAMutableData(gpu), v.size()); + cudaDeviceSynchronize(); + for (size_t i = 0; i < v.size(); ++i) { + EXPECT_EQ(v[i], i * 2); + } +} + +TEST(LoDTensor, LoDInGPU) { + paddle::framework::InitDevices(); + + paddle::framework::LoDTensor lod_tensor; + paddle::platform::CUDAPlace place(0); + + paddle::framework::LoD src_lod; + src_lod.push_back(std::vector{0, 2, 4, 6, 8, 10, 12, 14}); + + lod_tensor.Resize({14, 16}); + lod_tensor.mutable_data(place); + + lod_tensor.set_lod(src_lod); + EXPECT_EQ(lod_tensor.lod_element(0, 2).first, 4UL); + EXPECT_EQ(lod_tensor.lod_element(0, 4).first, 8UL); + + auto lod = lod_tensor.lod(); + + test<<<1, 8>>>(lod[0].CUDAMutableData(place), lod[0].size()); + cudaDeviceSynchronize(); + + for (size_t i = 0; i < src_lod[0].size(); ++i) { + EXPECT_EQ(lod[0].data()[i], src_lod[0].data()[i] * 2); + } +} diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h new file mode 100644 index 0000000000000000000000000000000000000000..9756754260d46519d181f95e000f39ba92d22ef0 --- /dev/null +++ b/paddle/fluid/framework/mixed_vector.h @@ -0,0 +1,363 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" + +#include "glog/logging.h" + +namespace paddle { +namespace framework { + +// Vector implements the std::vector interface, and can get Data or +// MutableData from any place. The data will be synced implicitly inside. +template +class Vector { + public: + using value_type = T; + + // Default ctor. Create empty Vector + Vector() { InitEmpty(); } + + // Fill vector with value. The vector size is `count`. + explicit Vector(size_t count, const T& value = T()) { + if (count == 0) { + InitEmpty(); + } else { + resize(count); + T* ptr = begin(); + for (size_t i = 0; i < count; ++i) { + ptr[i] = value; + } + } + } + + // Ctor with init_list + Vector(std::initializer_list init) { + if (init.size() == 0) { + InitEmpty(); + } else { + InitByIter(init.size(), init.begin(), init.end()); + } + } + + // implicit cast from std::vector. 
+ template + Vector(const std::vector& dat) { // NOLINT + if (dat.size() == 0) { + InitEmpty(); + } else { + InitByIter(dat.size(), dat.begin(), dat.end()); + } + } + + // Copy ctor + Vector(const Vector& other) { this->operator=(other); } + + // Copy operator + Vector& operator=(const Vector& other) { + if (other.size() != 0) { + this->InitByIter(other.size(), other.begin(), other.end()); + } else { + InitEmpty(); + } + return *this; + } + + // Move ctor + Vector(Vector&& other) { + this->size_ = other.size_; + this->flag_ = other.flag_; + if (other.cuda_vec_.memory_size()) { + this->cuda_vec_.ShareDataWith(other.cuda_vec_); + } + if (other.cpu_vec_.memory_size()) { + this->cpu_vec_.ShareDataWith(other.cpu_vec_); + } + } + + // CPU data access method. Mutable. + T& operator[](size_t i) { + MutableCPU(); + return const_cast(cpu_vec_.data())[i]; + } + + // CPU data access method. Immutable. + const T& operator[](size_t i) const { + ImmutableCPU(); + return cpu_vec_.data()[i]; + } + + // std::vector iterator methods. Based on CPU data access method + size_t size() const { return size_; } + + T* begin() { return &this->operator[](0); } + + T* end() { return &this->operator[](size()); } + + T& front() { return *begin(); } + + T& back() { + auto it = end(); + --it; + return *it; + } + + const T* begin() const { return &this->operator[](0); } + const T* end() const { return &this->operator[](size()); } + + const T& back() const { + auto it = end(); + --it; + return *it; + } + + T* data() { return begin(); } + + const T* data() const { return begin(); } + + const T& front() const { return *begin(); } + // end of std::vector iterator methods + + // assign this from iterator. + // NOTE: the iterator must support `end-begin` + template + void assign(Iter begin, Iter end) { + InitByIter(end - begin, begin, end); + } + + // push_back. If the previous capacity is not enough, the memory will + // double. + void push_back(T elem) { + if (size_ + 1 > capacity()) { + reserve((size_ + 1) << 1); + } + *end() = elem; + ++size_; + } + + // extend a vector by iterator. + // NOTE: the iterator must support end-begin + template + void Extend(It begin, It end) { + size_t pre_size = size_; + resize(pre_size + (end - begin)); + T* ptr = this->begin() + pre_size; + for (; begin < end; ++begin, ++ptr) { + *ptr = *begin; + } + } + + // resize the vector + void resize(size_t size) { + if (size + 1 < capacity()) { + size_ = size; + } else { + MutableCPU(); + Tensor cpu_tensor; + platform::Place cpu = platform::CPUPlace(); + T* ptr = cpu_tensor.mutable_data( + framework::make_ddim({static_cast(size)}), cpu); + const T* old_ptr = + cpu_vec_.memory_size() == 0 ? nullptr : cpu_vec_.data(); + if (old_ptr != nullptr) { + std::copy(old_ptr, old_ptr + size_, ptr); + } + size_ = size; + cpu_vec_.ShareDataWith(cpu_tensor); + } + } + + // get cuda ptr. immutable + const T* CUDAData(platform::Place place) const { + PADDLE_ENFORCE(platform::is_gpu_place(place), + "CUDA Data must on CUDA place"); + ImmutableCUDA(place); + return cuda_vec_.data(); + } + + // get cuda ptr. 
mutable + T* CUDAMutableData(platform::Place place) { + const T* ptr = CUDAData(place); + flag_ = kDirty | kDataInCUDA; + return const_cast(ptr); + } + + // clear + void clear() { + size_ = 0; + flag_ = kDirty | kDataInCPU; + } + + size_t capacity() const { + return cpu_vec_.memory_size() / SizeOfType(typeid(T)); + } + + // reserve data + void reserve(size_t size) { + size_t pre_size = size_; + resize(size); + resize(pre_size); + } + + // the unify method to access CPU or CUDA data. immutable. + const T* Data(platform::Place place) const { + if (platform::is_gpu_place(place)) { + return CUDAData(place); + } else { + return data(); + } + } + + // the unify method to access CPU or CUDA data. mutable. + T* MutableData(platform::Place place) { + if (platform::is_gpu_place(place)) { + return CUDAMutableData(place); + } else { + return data(); + } + } + + // implicit cast operator. Vector can be cast to std::vector implicitly. + operator std::vector() const { + std::vector result; + result.resize(size()); + std::copy(begin(), end(), result.begin()); + return result; + } + + bool operator==(const Vector& other) const { + if (size() != other.size()) return false; + for (auto it1 = begin(), it2 = other.begin(); it1 < end(); ++it1, ++it2) { + if (*it1 != *it2) { + return false; + } + } + return true; + } + + private: + void InitEmpty() { + size_ = 0; + flag_ = kDataInCPU; + } + + template + void InitByIter(size_t size, Iter begin, Iter end) { + platform::Place cpu = platform::CPUPlace(); + T* ptr = this->cpu_vec_.template mutable_data( + framework::make_ddim({static_cast(size)}), cpu); + for (size_t i = 0; i < size; ++i) { + *ptr++ = *begin++; + } + flag_ = kDataInCPU | kDirty; + size_ = size; + } + + enum DataFlag { + kDataInCPU = 0x01, + kDataInCUDA = 0x02, + // kDirty means the data has been changed in one device. + kDirty = 0x10 + }; + + void CopyToCPU() const { + // COPY GPU Data To CPU + Copy(cuda_vec_, platform::CPUPlace(), &cpu_vec_); + WaitPlace(cuda_vec_.place()); + } + + void MutableCPU() { + if (IsInCUDA() && IsDirty()) { + CopyToCPU(); + } + flag_ = kDirty | kDataInCPU; + } + + void ImmutableCUDA(platform::Place place) const { + if (IsDirty()) { + if (IsInCPU()) { + Copy(cpu_vec_, boost::get(place), &cuda_vec_); + WaitPlace(place); + UnsetFlag(kDirty); + SetFlag(kDataInCUDA); + } else if (IsInCUDA() && !(place == cuda_vec_.place())) { + framework::Tensor tmp; + Copy(cuda_vec_, boost::get(place), &tmp); + WaitPlace(cuda_vec_.place()); + cuda_vec_.ShareDataWith(tmp); + // Still dirty + } else { + // Dirty && DataInCUDA && Device is same + // Do nothing + } + } else { + if (!IsInCUDA()) { + // Even data is not dirty. However, data is not in CUDA. Copy data. + Copy(cpu_vec_, boost::get(place), &cuda_vec_); + WaitPlace(place); + SetFlag(kDataInCUDA); + } else if (!(place == cuda_vec_.place())) { + framework::Tensor tmp; + WaitPlace(cuda_vec_.place()); + Copy(cuda_vec_, boost::get(place), &tmp); + WaitPlace(cuda_vec_.place()); + WaitPlace(place); + cuda_vec_.ShareDataWith(tmp); + } else { + // Not Dirty && DataInCUDA && Device is same + // Do nothing. + } + } + } + + void ImmutableCPU() const { + if (IsDirty() && + !IsInCPU()) { // If data has been changed in CUDA, or CPU has no data. 
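The flag machinery here is easiest to read next to a concrete use. The CPU-only sketch below exercises only members declared in this class (push_back, operator[], the implicit std::vector conversion), so it needs no GPU build; it is an illustration, not a test shipped with the patch.

#include <vector>
#include "paddle/fluid/framework/mixed_vector.h"

void MixedVectorCpuExample() {
  paddle::framework::Vector<int> v;   // empty, data flagged as CPU-resident
  for (int i = 0; i < 4; ++i) v.push_back(i);
  v[0] = 42;                          // mutable access marks the data dirty
  std::vector<int> plain = v;         // implicit copy back to a std::vector
  // On a CUDA build, v.CUDAData(gpu_place) would lazily copy the same buffer
  // to the device, as mixed_vector_test.cu below demonstrates.
  (void)plain;
}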
+ CopyToCPU(); + UnsetFlag(kDirty); + } + SetFlag(kDataInCPU); + } + + void UnsetFlag(int flag) const { flag_ &= ~flag; } + void SetFlag(int flag) const { flag_ |= flag; } + + bool IsDirty() const { return flag_ & kDirty; } + + bool IsInCUDA() const { return flag_ & kDataInCUDA; } + + bool IsInCPU() const { return flag_ & kDataInCPU; } + + static void WaitPlace(const platform::Place place) { + if (platform::is_gpu_place(place)) { + platform::DeviceContextPool::Instance() + .Get(boost::get(place)) + ->Wait(); + } + } + + mutable int flag_; + mutable Tensor cpu_vec_; + mutable Tensor cuda_vec_; + size_t size_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/mixed_vector_test.cu b/paddle/fluid/framework/mixed_vector_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..a89064525661af71b22f18f835fd7b111956847b --- /dev/null +++ b/paddle/fluid/framework/mixed_vector_test.cu @@ -0,0 +1,93 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ +#include + +#include "glog/logging.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/mixed_vector.h" +#include "paddle/fluid/platform/gpu_info.h" + +template +using vec = paddle::framework::Vector; + +TEST(mixed_vector, CPU_VECTOR) { + vec tmp; + for (int i = 0; i < 10; ++i) { + tmp.push_back(i); + } + ASSERT_EQ(tmp.size(), 10); + vec tmp2; + tmp2 = tmp; + ASSERT_EQ(tmp2.size(), 10); + for (int i = 0; i < 10; ++i) { + ASSERT_EQ(tmp2[i], i); + ASSERT_EQ(tmp2[i], tmp[i]); + } + int cnt = 0; + for (auto& t : tmp2) { + ASSERT_EQ(t, cnt); + ++cnt; + } +} + +static __global__ void multiply_10(int* ptr) { + for (int i = 0; i < 10; ++i) { + ptr[i] *= 10; + } +} + +cudaStream_t GetCUDAStream(paddle::platform::CUDAPlace place) { + return reinterpret_cast( + paddle::platform::DeviceContextPool::Instance().Get(place)) + ->stream(); +} + +TEST(mixed_vector, GPU_VECTOR) { + vec tmp; + for (int i = 0; i < 10; ++i) { + tmp.push_back(i); + } + ASSERT_EQ(tmp.size(), 10); + paddle::platform::CUDAPlace gpu(0); + + multiply_10<<<1, 1, 0, GetCUDAStream(gpu)>>>(tmp.MutableData(gpu)); + + for (int i = 0; i < 10; ++i) { + ASSERT_EQ(tmp[i], i * 10); + } +} + +TEST(mixed_vector, MultiGPU) { + if (paddle::platform::GetCUDADeviceCount() < 2) { + LOG(WARNING) << "Skip mixed_vector.MultiGPU since there are not multiple " + "GPUs in your machine."; + return; + } + + vec tmp; + for (int i = 0; i < 10; ++i) { + tmp.push_back(i); + } + ASSERT_EQ(tmp.size(), 10); + paddle::platform::CUDAPlace gpu0(0); + paddle::platform::SetDeviceId(0); + multiply_10<<<1, 1, 0, GetCUDAStream(gpu0)>>>(tmp.MutableData(gpu0)); + paddle::platform::CUDAPlace gpu1(1); + auto* gpu1_ptr = tmp.MutableData(gpu1); + paddle::platform::SetDeviceId(1); + multiply_10<<<1, 1, 0, GetCUDAStream(gpu1)>>>(gpu1_ptr); + for (int i = 0; i < 10; ++i) { + ASSERT_EQ(tmp[i], i * 100); + } +} diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc new file mode 100644 index 
0000000000000000000000000000000000000000..cbc15e60b83397ed8420bc7a4cd716ef15979554 --- /dev/null +++ b/paddle/fluid/framework/op_desc.cc @@ -0,0 +1,521 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_desc.h" +#include +#include +#include +#include "glog/logging.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/shape_inference.h" + +namespace paddle { +namespace framework { + +class OpDesc; +class BlockDesc; +class CompileTimeInferShapeContext : public InferShapeContext { + public: + CompileTimeInferShapeContext(const OpDesc &op, const BlockDesc &block); + + bool HasInput(const std::string &name) const override; + + bool HasOutput(const std::string &name) const override; + + bool HasInputs(const std::string &name) const override; + + bool HasOutputs(const std::string &name) const override; + + AttrReader Attrs() const override; + + const std::vector &Inputs( + const std::string &name) const override; + + const std::vector &Outputs( + const std::string &name) const override; + + void ShareLoD(const std::string &in, const std::string &out, size_t i = 0, + size_t j = 0) const override { + PADDLE_ENFORCE_LT(i, Inputs(in).size()); + PADDLE_ENFORCE_LT(j, Outputs(out).size()); + auto *in_var = block_.FindVarRecursive(Inputs(in)[i]); + auto *out_var = block_.FindVarRecursive(Outputs(out)[j]); + if (in_var->GetType() != proto::VarDesc::LOD_TENSOR) { + VLOG(3) << "input " << in << " is not LodTensor"; + return; + } + PADDLE_ENFORCE_EQ(in_var->GetType(), proto::VarDesc::LOD_TENSOR, + "The %d-th output of Output(%s) must be LoDTensor.", j, + out); + out_var->SetLoDLevel(in_var->GetLoDLevel()); + } + + bool IsRuntime() const override; + + protected: + proto::VarDesc::VarType GetVarType(const std::string &name) const override; + + DDim GetDim(const std::string &name) const override; + + void SetDim(const std::string &name, const DDim &dim) override; + + std::vector GetRepeatedDims(const std::string &name) const override; + + void SetRepeatedDims(const std::string &name, + const std::vector &dims) override; + + InferShapeVarPtr GetVarPtr(const std::string &name) override; + + const OpDesc &op_; + const BlockDesc &block_; +}; + +OpDesc::OpDesc(const std::string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, const AttributeMap &attrs) { + desc_.set_type(type); + inputs_ = inputs; + outputs_ = outputs; + attrs_ = attrs; + need_update_ = true; +} + +void OpDesc::CopyFrom(const OpDesc &op_desc) { + desc_.set_type(op_desc.Type()); + inputs_ = op_desc.inputs_; + outputs_ = op_desc.outputs_; + attrs_ = op_desc.attrs_; + need_update_ = true; +} + +OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog, BlockDesc *block) + : desc_(desc), need_update_(false) { + // restore inputs_ + int input_size = desc_.inputs_size(); + for (int i = 0; i < input_size; ++i) { + const 
proto::OpDesc::Var &var = desc_.inputs(i); + std::vector &args = inputs_[var.parameter()]; + int argu_size = var.arguments_size(); + args.reserve(argu_size); + for (int j = 0; j < argu_size; ++j) { + args.push_back(var.arguments(j)); + } + } + // restore outputs_ + int output_size = desc_.outputs_size(); + for (int i = 0; i < output_size; ++i) { + const proto::OpDesc::Var &var = desc_.outputs(i); + std::vector &args = outputs_[var.parameter()]; + int argu_size = var.arguments_size(); + args.reserve(argu_size); + for (int j = 0; j < argu_size; ++j) { + args.push_back(var.arguments(j)); + } + } + // restore attrs_ + for (const proto::OpDesc::Attr &attr : desc_.attrs()) { + std::string attr_name = attr.name(); + // The sub_block referred to by the BLOCK attr hasn't been added + // to ProgramDesc class yet, we skip setting BLOCK attr here. + if (attr.type() != proto::AttrType::BLOCK) { + attrs_[attr_name] = GetAttrValue(attr); + } + } + this->block_ = block; +} + +proto::OpDesc *OpDesc::Proto() { + Flush(); + return &desc_; +} + +const std::vector &OpDesc::Input(const std::string &name) const { + auto it = inputs_.find(name); + PADDLE_ENFORCE(it != inputs_.end(), "Input %s cannot be found in Op %s", name, + Type()); + return it->second; +} + +std::vector OpDesc::InputArgumentNames() const { + std::vector retv; + for (auto &ipt : this->inputs_) { + retv.insert(retv.end(), ipt.second.begin(), ipt.second.end()); + } + return retv; +} + +void OpDesc::SetInput(const std::string ¶m_name, + const std::vector &args) { + need_update_ = true; + inputs_[param_name] = args; +} + +const std::vector &OpDesc::Output(const std::string &name) const { + auto it = outputs_.find(name); + PADDLE_ENFORCE(it != outputs_.end(), "Output %s cannot be found in Op %s", + name, Type()); + return it->second; +} + +std::vector OpDesc::OutputArgumentNames() const { + std::vector retv; + for (auto &ipt : this->outputs_) { + retv.insert(retv.end(), ipt.second.begin(), ipt.second.end()); + } + return retv; +} + +void OpDesc::SetOutput(const std::string ¶m_name, + const std::vector &args) { + need_update_ = true; + this->outputs_[param_name] = args; +} + +proto::AttrType OpDesc::GetAttrType(const std::string &name) const { + auto it = attrs_.find(name); + PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name); + return static_cast(it->second.which() - 1); +} + +std::vector OpDesc::AttrNames() const { + std::vector retv; + retv.reserve(attrs_.size()); + for (auto &attr : attrs_) { + retv.push_back(attr.first); + } + return retv; +} + +void OpDesc::SetAttr(const std::string &name, const Attribute &v) { + this->attrs_[name] = v; + need_update_ = true; +} + +void OpDesc::SetBlockAttr(const std::string &name, BlockDesc &block) { + this->attrs_[name] = █ + need_update_ = true; +} + +void OpDesc::SetAttrMap( + const std::unordered_map &attr_map) { + attrs_ = attr_map; + need_update_ = true; +} + +Attribute OpDesc::GetAttr(const std::string &name) const { + auto it = attrs_.find(name); + PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name); + return it->second; +} + +int OpDesc::GetBlockAttr(const std::string &name) const { + auto it = attrs_.find(name); + PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name); + return boost::get(it->second)->ID(); +} + +const std::unordered_map &OpDesc::GetAttrMap() const { + return attrs_; +} + +void OpDesc::Rename(const std::string &old_name, const std::string &new_name) { + for (auto &input : inputs_) { + std::replace(input.second.begin(), 
input.second.end(), old_name, new_name); + } + for (auto &output : outputs_) { + std::replace(output.second.begin(), output.second.end(), old_name, + new_name); + } + need_update_ = true; +} + +void OpDesc::RenameOutput(const std::string &old_name, + const std::string &new_name) { + for (auto &output : outputs_) { + std::replace(output.second.begin(), output.second.end(), old_name, + new_name); + } + need_update_ = true; +} + +void OpDesc::RenameInput(const std::string &old_name, + const std::string &new_name) { + for (auto &input : inputs_) { + std::replace(input.second.begin(), input.second.end(), old_name, new_name); + } + need_update_ = true; +} + +struct SetAttrDescVisitor : public boost::static_visitor { + explicit SetAttrDescVisitor(proto::OpDesc::Attr *attr) : attr_(attr) {} + mutable proto::OpDesc::Attr *attr_; + void operator()(int v) const { attr_->set_i(v); } + void operator()(float v) const { attr_->set_f(v); } + void operator()(const std::string &v) const { attr_->set_s(v); } + + // Please refer to https://github.com/PaddlePaddle/Paddle/issues/7162 + template ::value>::type> + void operator()(T b) const { + attr_->set_b(b); + } + + void operator()(const std::vector &v) const { + VectorToRepeated(v, attr_->mutable_ints()); + } + void operator()(const std::vector &v) const { + VectorToRepeated(v, attr_->mutable_floats()); + } + void operator()(const std::vector &v) const { + VectorToRepeated(v, attr_->mutable_strings()); + } + void operator()(const std::vector &v) const { + VectorToRepeated(v, attr_->mutable_bools()); + } + void operator()(BlockDesc *desc) const { attr_->set_block_idx(desc->ID()); } + void operator()(int64_t v) const { attr_->set_l(v); } + void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); } +}; + +void OpDesc::Flush() { + if (need_update_) { + this->desc_.mutable_inputs()->Clear(); + for (auto &ipt : inputs_) { + auto *input = desc_.add_inputs(); + input->set_parameter(ipt.first); + VectorToRepeated(ipt.second, input->mutable_arguments()); + } + + this->desc_.mutable_outputs()->Clear(); + for (auto &opt : outputs_) { + auto *output = desc_.add_outputs(); + output->set_parameter(opt.first); + VectorToRepeated(opt.second, output->mutable_arguments()); + } + + this->desc_.mutable_attrs()->Clear(); + for (auto &attr : attrs_) { + auto *attr_desc = desc_.add_attrs(); + attr_desc->set_name(attr.first); + attr_desc->set_type( + static_cast(attr.second.which() - 1)); + SetAttrDescVisitor visitor(attr_desc); + boost::apply_visitor(visitor, attr.second); + } + + need_update_ = false; + } +} + +static std::once_flag init_infer_shape_funcs; + +static void InitInferShapeFuncs() { + std::call_once(init_infer_shape_funcs, [] { + auto &map = OpInfoMap::Instance(); + auto &info_map = *map.mutable_map(); + + for (auto &kern_pair : OperatorWithKernel::AllOpKernels()) { + auto op_type = kern_pair.first; + auto &op_info = info_map.at(op_type); + auto op = static_cast(op_info.Creator()( + "", VariableNameMap{}, VariableNameMap{}, AttributeMap{})); + if (op_info.infer_shape_) { // infer_shape has been registered. + continue; + } + op_info.infer_shape_ = [op](InferShapeContext *ctx) { + op->InferShape(ctx); + }; + } + }); +} + +void OpDesc::CheckAttrs() { + PADDLE_ENFORCE(!Type().empty(), + "CheckAttr() can not be called before type is setted."); + auto *checker = OpInfoMap::Instance().Get(Type()).Checker(); + if (checker == nullptr) { + // checker is not configured. That operator could be generated by Paddle, + // not by users. 
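Taken together, the mutators defined above are what a front end uses to assemble an operator description by hand. The sketch below strings them together; the op type, argument names, and attribute are placeholders, and only methods declared in op_desc.h later in this patch are used.

#include "paddle/fluid/framework/op_desc.h"

void BuildOpDescExample() {
  paddle::framework::OpDesc op;
  op.SetType("my_scale");            // placeholder type, not a real operator here
  op.SetInput("X", {"x0"});
  op.SetOutput("Out", {"out0"});
  op.SetAttr("scale", 2.0f);         // stored in attrs_, sets need_update_
  // Proto() triggers Flush(), which rebuilds the proto::OpDesc from
  // inputs_/outputs_/attrs_ via SetAttrDescVisitor as defined above.
  paddle::framework::proto::OpDesc* pb = op.Proto();
  (void)pb;
  // CheckAttrs() would additionally run the registered OpAttrChecker, but
  // only for an op type that has actually been registered.
}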
+ return; + } + checker->Check(attrs_); +} + +void OpDesc::InferShape(const BlockDesc &block) const { + VLOG(3) << "CompileTime infer shape on " << Type(); + InitInferShapeFuncs(); + auto &infer_shape = OpInfoMap::Instance().Get(this->Type()).infer_shape_; + PADDLE_ENFORCE(static_cast(infer_shape), + "%s's infer_shape has not been registered", this->Type()); + CompileTimeInferShapeContext ctx(*this, block); + if (VLOG_IS_ON(10)) { + std::ostringstream sout; + auto inames = this->InputArgumentNames(); + sout << " From ["; + std::copy(inames.begin(), inames.end(), + std::ostream_iterator(sout, ", ")); + sout << "] to ["; + auto onames = this->OutputArgumentNames(); + std::copy(onames.begin(), onames.end(), + std::ostream_iterator(sout, ", ")); + sout << "]"; + VLOG(10) << sout.str(); + } + infer_shape(&ctx); +} + +void OpDesc::InferVarType(BlockDesc *block) const { + auto &info = OpInfoMap::Instance().Get(this->Type()); + if (info.infer_var_type_) { + info.infer_var_type_(*this, block); + } else { + // all output type is LoDTensor by default + VLOG(10) << this->Type() + << " has not registered InferVarType. Set output variables to " + "LOD_TENSOR"; + for (auto &out_pair : this->outputs_) { + for (auto &out_var_name : out_pair.second) { + block->FindRecursiveOrCreateVar(out_var_name) + .SetType(proto::VarDesc::LOD_TENSOR); + } + } + } +} + +CompileTimeInferShapeContext::CompileTimeInferShapeContext( + const OpDesc &op, const BlockDesc &block) + : op_(op), block_(block) {} + +bool CompileTimeInferShapeContext::HasInput(const std::string &name) const { + const std::vector &input_names = op_.Input(name); + auto length = input_names.size(); + if (length == 0) { + return false; + } + PADDLE_ENFORCE_EQ(length, 1UL, + "Input(%s) should have only one value, " + "but it have %d now", + name, length); + return block_.HasVarRecursive(input_names[0]); +} + +bool CompileTimeInferShapeContext::HasOutput(const std::string &name) const { + const std::vector &output_names = op_.Output(name); + auto length = output_names.size(); + if (length == 0) { + return false; + } + PADDLE_ENFORCE_EQ(length, 1UL, + "Output(%s) should have only one value, " + "but it have %d now", + name, length); + return block_.HasVarRecursive(output_names[0]); +} + +bool CompileTimeInferShapeContext::HasInputs(const std::string &name) const { + const std::vector &input_names = op_.Input(name); + if (input_names.empty()) { + return false; + } + for (auto &input : input_names) { + if (!block_.HasVarRecursive(input)) return false; + } + return true; +} + +bool CompileTimeInferShapeContext::HasOutputs(const std::string &name) const { + const std::vector &output_names = op_.Output(name); + if (output_names.empty()) { + return false; + } + for (auto &output : output_names) { + if (!block_.HasVarRecursive(output)) return false; + } + return true; +} + +AttrReader CompileTimeInferShapeContext::Attrs() const { + return AttrReader(op_.GetAttrMap()); +} + +const std::vector &CompileTimeInferShapeContext::Inputs( + const std::string &name) const { + return op_.Input(name); +} + +const std::vector &CompileTimeInferShapeContext::Outputs( + const std::string &name) const { + return op_.Output(name); +} + +DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const { + auto var = block_.FindVarRecursive(name); + PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name); + DDim res; + try { + auto shape = var->GetShape(); + res = shape.empty() ? make_ddim({0UL}) : make_ddim(shape); + } catch (...) 
{ + VLOG(5) << "GetDim of variable " << name << " error"; + std::rethrow_exception(std::current_exception()); + } + return res; +} + +std::vector CompileTimeInferShapeContext::GetRepeatedDims( + const std::string &name) const { + auto var = block_.FindVarRecursive(name); + PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name); + std::vector res; + try { + auto shapes = var->GetShapes(); + for (const auto &s : shapes) { + res.push_back(s.empty() ? make_ddim({0UL}) : make_ddim(s)); + } + } catch (...) { + VLOG(5) << "GetRepeatedDim of variable " << name << " error."; + std::rethrow_exception(std::current_exception()); + } + return res; +} + +void CompileTimeInferShapeContext::SetDim(const std::string &name, + const DDim &dim) { + block_.FindVarRecursive(name)->SetShape(vectorize(dim)); +} + +void CompileTimeInferShapeContext::SetRepeatedDims( + const std::string &name, const std::vector &dims) { + auto var = block_.FindVarRecursive(name); + PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name); + std::vector> dim_vec(dims.size()); + std::transform(dims.begin(), dims.end(), dim_vec.begin(), vectorize); + var->SetShapes(dim_vec); +} + +bool CompileTimeInferShapeContext::IsRuntime() const { return false; } + +proto::VarDesc::VarType CompileTimeInferShapeContext::GetVarType( + const std::string &name) const { + return block_.FindVarRecursive(name)->GetType(); +} + +InferShapeVarPtr CompileTimeInferShapeContext::GetVarPtr( + const std::string &name) { + return block_.FindVarRecursive(name); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..698df829e56e1182e742db926a712497ee2b6966 --- /dev/null +++ b/paddle/fluid/framework/op_desc.h @@ -0,0 +1,154 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/attribute.h" +#include "paddle/fluid/framework/type_defs.h" +#include "paddle/fluid/framework/var_desc.h" + +namespace paddle { +namespace framework { + +class BlockDesc; +class ProgramDesc; +class OpDesc { + public: + OpDesc() {} + + OpDesc(const std::string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, const AttributeMap &attrs); + + OpDesc(const proto::OpDesc &desc, ProgramDesc *prog, BlockDesc *block); + + explicit OpDesc(BlockDesc *block) : block_(block) {} + + OpDesc(const OpDesc &other, BlockDesc *block) { + *this = other; + block_ = block; + } + + void CopyFrom(const OpDesc &op_desc); + + proto::OpDesc *Proto(); + + std::string Type() const { return desc_.type(); } + + void SetType(const std::string &type) { desc_.set_type(type); } + + const std::vector &Input(const std::string &name) const; + + std::vector InputArgumentNames() const; + + void SetInput(const std::string ¶m_name, + const std::vector &args); + + const std::vector &Output(const std::string &name) const; + + std::vector OutputArgumentNames() const; + + void SetOutput(const std::string ¶m_name, + const std::vector &args); + + bool HasAttr(const std::string &name) const { + return attrs_.find(name) != attrs_.end(); + } + + proto::AttrType GetAttrType(const std::string &name) const; + + std::vector AttrNames() const; + + void SetAttr(const std::string &name, const Attribute &v); + + void SetBlockAttr(const std::string &name, BlockDesc &block); + + Attribute GetAttr(const std::string &name) const; + + int GetBlockAttr(const std::string &name) const; + + void Rename(const std::string &old_name, const std::string &new_name); + + void RenameOutput(const std::string &old_name, const std::string &new_name); + + void RenameInput(const std::string &old_name, const std::string &new_name); + + // Only be used in C++ + const AttributeMap &GetAttrMap() const; + + // Only be used in C++ + void SetAttrMap(const AttributeMap &attr_map); + + std::vector InputNames() const { return MapKeys(inputs_); } + std::vector OutputNames() const { return MapKeys(outputs_); } + + void SetInputMap(const VariableNameMap &input) { + this->inputs_ = input; + this->need_update_ = true; + } + + void SetOutputMap(const VariableNameMap &output) { + this->outputs_ = output; + this->need_update_ = true; + } + + const VariableNameMap &Inputs() const { return inputs_; } + + const VariableNameMap &Outputs() const { return outputs_; } + + AttributeMap *MutableAttrMap() { + this->need_update_ = true; + return &this->attrs_; + } + + void CheckAttrs(); + + void InferShape(const BlockDesc &block) const; + + void InferVarType(BlockDesc *block) const; + + void MarkAsTarget() { desc_.set_is_target(true); } + + void Flush(); + + BlockDesc *Block() { return this->block_; } + + void SetBlock(BlockDesc *block) { this->block_ = block; } + + private: + template + static std::vector MapKeys(const MapType &map) { + std::vector ret_val; + ret_val.reserve(map.size()); + std::transform( + map.begin(), map.end(), std::back_inserter(ret_val), + [](const typename MapType::value_type &pair) { return pair.first; }); + return ret_val; + } + + proto::OpDesc desc_; + BlockDesc *block_; // not_own + // input arg name => input variable names + VariableNameMap inputs_; + // output arg name => output variable names + VariableNameMap outputs_; + AttributeMap attrs_; + + // need_update_ indicate there some local changes not be synchronized. 
If + // local changes should be synchronized, need_update_ should be set to true. + bool need_update_{false}; +}; +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_info.cc b/paddle/fluid/framework/op_info.cc new file mode 100644 index 0000000000000000000000000000000000000000..703c9c3234b62e80c3f768ddb892584c1c0070c0 --- /dev/null +++ b/paddle/fluid/framework/op_info.cc @@ -0,0 +1,29 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_info.h" + +namespace paddle { +namespace framework { + +static OpInfoMap* g_op_info_map = nullptr; + +OpInfoMap& OpInfoMap::Instance() { + if (g_op_info_map == nullptr) { + g_op_info_map = new OpInfoMap(); + } + return *g_op_info_map; +} +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_info.h b/paddle/fluid/framework/op_info.h new file mode 100644 index 0000000000000000000000000000000000000000..e6b3ff9e653196b9234e02131f37d5964c4f6e84 --- /dev/null +++ b/paddle/fluid/framework/op_info.h @@ -0,0 +1,109 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include +#include +#include + +#include "paddle/fluid/framework/attribute.h" +#include "paddle/fluid/framework/type_defs.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace framework { + +class InferShapeBase { + public: + virtual ~InferShapeBase() = default; + virtual void operator()(InferShapeContext*) const = 0; +}; + +struct OpInfo { + OpCreator creator_; + GradOpMakerFN grad_op_maker_; + proto::OpProto* proto_{nullptr}; + OpAttrChecker* checker_{nullptr}; + InferVarTypeFN infer_var_type_; + InferShapeFN infer_shape_; + + bool HasOpProtoAndChecker() const { + return proto_ != nullptr && checker_ != nullptr; + } + + const proto::OpProto& Proto() const { + PADDLE_ENFORCE_NOT_NULL(proto_, "Operator Proto has not been registered"); + PADDLE_ENFORCE(proto_->IsInitialized(), + "Operator Proto must be initialized in op info"); + return *proto_; + } + + const OpCreator& Creator() const { + PADDLE_ENFORCE_NOT_NULL(creator_, + "Operator Creator has not been registered"); + return creator_; + } + + const GradOpMakerFN& GradOpMaker() const { + PADDLE_ENFORCE_NOT_NULL(grad_op_maker_, + "Operator GradOpMaker has not been registered."); + return grad_op_maker_; + } + + const OpAttrChecker* Checker() const { return checker_; } +}; + +class OpInfoMap { + public: + static OpInfoMap& Instance(); + + bool Has(const std::string& op_type) const { + return map_.find(op_type) != map_.end(); + } + + void Insert(const std::string& type, const OpInfo& info) { + PADDLE_ENFORCE(!Has(type), "Operator %s has been registered", type); + map_.insert({type, info}); + } + + const OpInfo& Get(const std::string& type) const { + auto op_info_ptr = GetNullable(type); + PADDLE_ENFORCE_NOT_NULL(op_info_ptr, "Operator %s has not been registered", + type); + return *op_info_ptr; + } + + const OpInfo* GetNullable(const std::string& type) const { + auto it = map_.find(type); + if (it == map_.end()) { + return nullptr; + } else { + return &it->second; + } + } + + const std::unordered_map& map() const { return map_; } + + std::unordered_map* mutable_map() { return &map_; } + + private: + OpInfoMap() = default; + std::unordered_map map_; + + DISABLE_COPY_AND_ASSIGN(OpInfoMap); +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_kernel_type.h b/paddle/fluid/framework/op_kernel_type.h new file mode 100644 index 0000000000000000000000000000000000000000..b5dbff26d7edc212a270d4d187dbb868068790c9 --- /dev/null +++ b/paddle/fluid/framework/op_kernel_type.h @@ -0,0 +1,99 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/library_type.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace framework { + +struct OpKernelType { + struct Hash { + size_t operator()(const OpKernelType& key) const { + int place = key.place_.which(); + int data_type = static_cast(key.data_type_) << LEFT_SHIFT; + int data_layout = static_cast(key.data_layout_) << (LEFT_SHIFT * 2); + int library_type = static_cast(key.library_type_) + << (LEFT_SHIFT * 3); + + std::hash hasher; + return hasher(place + data_type + data_layout + library_type); + } + }; + + // place, data_type, library_type kinds less than 2^8 + constexpr static int LEFT_SHIFT = 8; + + proto::DataType data_type_; + DataLayout data_layout_; + platform::Place place_; + LibraryType library_type_; + + OpKernelType(proto::DataType data_type, platform::Place place, + DataLayout data_layout = DataLayout::kAnyLayout, + LibraryType library_type = LibraryType::kPlain) + : data_type_(data_type), + data_layout_(data_layout), + place_(place), + library_type_(library_type) {} + + OpKernelType(proto::DataType data_type, + const platform::DeviceContext& dev_ctx, + DataLayout data_layout = DataLayout::kAnyLayout, + LibraryType library_type = LibraryType::kPlain) + : data_type_(data_type), + data_layout_(data_layout), + place_(dev_ctx.GetPlace()), + library_type_(library_type) {} + + bool operator==(const OpKernelType& o) const { + return platform::places_are_same_class(place_, o.place_) && + data_type_ == o.data_type_ && data_layout_ == o.data_layout_ && + library_type_ == o.library_type_; + } + + bool operator!=(const OpKernelType& o) const { return !(*this == o); } +}; + +inline std::ostream& operator<<(std::ostream& os, + const OpKernelType& kernel_key) { + os << "data_type[" << kernel_key.data_type_ << "]:data_layout[" + << kernel_key.data_layout_ << "]:place[" << kernel_key.place_ + << "]:library_type[" << kernel_key.library_type_ << "]"; + return os; +} + +inline std::string KernelTypeToString(const OpKernelType& kernel_key) { + std::ostringstream stream; + stream << kernel_key; + return stream.str(); +} + +inline bool NeedTransformLayout(const DataLayout& l, const DataLayout& r) { + return l != DataLayout::kAnyLayout && r != DataLayout::kAnyLayout && l != r; +} + +inline bool TransFromNeeded(const OpKernelType& l, const OpKernelType& r) { + return (!platform::places_are_same_class(l.place_, r.place_)) || + (l.data_type_ != r.data_type_) || + NeedTransformLayout(l.data_layout_, r.data_layout_); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_kernel_type_test.cc b/paddle/fluid/framework/op_kernel_type_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..64096907df5a52904525ef0bf25bb9527c3a8c4b --- /dev/null +++ b/paddle/fluid/framework/op_kernel_type_test.cc @@ -0,0 +1,49 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_kernel_type.h" +#include +#include + +TEST(OpKernelType, ToString) { + using OpKernelType = paddle::framework::OpKernelType; + using DataType = paddle::framework::proto::DataType; + using CPUPlace = paddle::platform::CPUPlace; + using DataLayout = paddle::framework::DataLayout; + using LibraryType = paddle::framework::LibraryType; + + OpKernelType op_kernel_type(DataType::FP32, CPUPlace(), DataLayout::kNCHW, + LibraryType::kCUDNN); + + ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type), + "data_type[float32]:data_layout[NCHW]:place[CPUPlace]:library_type[" + "CUDNN]"); +} + +TEST(OpKernelType, Hash) { + using OpKernelType = paddle::framework::OpKernelType; + using DataType = paddle::framework::proto::DataType; + using CPUPlace = paddle::platform::CPUPlace; + using CUDAPlace = paddle::platform::CUDAPlace; + using DataLayout = paddle::framework::DataLayout; + using LibraryType = paddle::framework::LibraryType; + + OpKernelType op_kernel_type_1(DataType::FP32, CPUPlace(), DataLayout::kNCHW, + LibraryType::kCUDNN); + OpKernelType op_kernel_type_2(DataType::FP32, CUDAPlace(0), DataLayout::kNCHW, + LibraryType::kCUDNN); + + OpKernelType::Hash hasher; + ASSERT_NE(hasher(op_kernel_type_1), hasher(op_kernel_type_2)); +} diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc new file mode 100644 index 0000000000000000000000000000000000000000..0a779b10b49ab35dd0dbe25ac3f2bccd34fb654e --- /dev/null +++ b/paddle/fluid/framework/op_proto_maker.cc @@ -0,0 +1,58 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_proto_maker.h" + +namespace paddle { +namespace framework { + +void OpProtoAndCheckerMaker::Validate() { + validated_ = true; + CheckNoDuplicatedInOutAttrs(); +} + +OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddInput( + const std::string& name, const std::string& comment) { + auto* input = proto_->add_inputs(); + input->set_name(name); + input->set_comment(comment); + return OpProtoAndCheckerMaker::VariableBuilder{input}; +} + +OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddOutput( + const std::string& name, const std::string& comment) { + auto* output = proto_->add_outputs(); + output->set_name(name); + output->set_comment(comment); + return OpProtoAndCheckerMaker::VariableBuilder{output}; +} + +void OpProtoAndCheckerMaker::CheckNoDuplicatedInOutAttrs() { + std::unordered_set names; + auto checker = [&](const std::string& name) { + PADDLE_ENFORCE(!names.count(name), "[%s] is duplicated", name); + names.insert(name); + }; + for (auto& attr : proto_->attrs()) { + checker(attr.name()); + } + for (auto& input : proto_->inputs()) { + checker(input.name()); + } + for (auto& output : proto_->outputs()) { + checker(output.name()); + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h new file mode 100644 index 0000000000000000000000000000000000000000..1dbfc7d37be6ae79fde39434b12355a54ee648f6 --- /dev/null +++ b/paddle/fluid/framework/op_proto_maker.h @@ -0,0 +1,90 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/attribute.h" +#include "paddle/fluid/framework/framework.pb.h" + +namespace paddle { +namespace framework { + +// this class not only make proto but also init attribute checkers. 
+class OpProtoAndCheckerMaker { + public: + using OpProto = proto::OpProto; + using OpAttrChecker = framework::OpAttrChecker; + OpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) + : proto_(proto), op_checker_(op_checker) {} + + virtual ~OpProtoAndCheckerMaker() { + PADDLE_ENFORCE(validated_, "should call Validate after build"); + } + + void Validate(); + + protected: + struct VariableBuilder { + OpProto::Var* var_; + + VariableBuilder& AsDuplicable() { + var_->set_duplicable(true); + return *this; + } + + VariableBuilder& AsIntermediate() { + var_->set_intermediate(true); + return *this; + } + + VariableBuilder& AsDispensable() { + var_->set_dispensable(true); + return *this; + } + }; + + VariableBuilder AddInput(const std::string& name, const std::string& comment); + + VariableBuilder AddOutput(const std::string& name, + const std::string& comment); + + template + TypedAttrChecker& AddAttr(const std::string& name, + const std::string& comment, + bool generated = false) { + auto* attr = proto_->add_attrs(); + attr->set_name(name); + attr->set_comment(comment); + attr->set_generated(generated); + attr->set_type(AttrTypeID()); + return op_checker_->AddAttrChecker(name); + } + + void AddComment(const std::string& comment) { proto_->set_comment(comment); } + + private: + void CheckNoDuplicatedInOutAttrs(); + + OpProto* proto_; + OpAttrChecker* op_checker_; + bool validated_{false}; +}; + +class NOPMaker : public OpProtoAndCheckerMaker { + public: + NOPMaker(OpProto* proto, framework::OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) {} +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_proto_maker_test.cc b/paddle/fluid/framework/op_proto_maker_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..cfefee8dbdead9dd0074d954fe7318baae57e8c4 --- /dev/null +++ b/paddle/fluid/framework/op_proto_maker_test.cc @@ -0,0 +1,51 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_proto_maker.h" + +#include "gtest/gtest.h" + +class TestAttrProtoMaker : public paddle::framework::OpProtoAndCheckerMaker { + public: + TestAttrProtoMaker(paddle::framework::proto::OpProto* proto, + paddle::framework::OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddAttr("scale", "scale of test op"); + AddAttr("scale", "scale of test op"); + } +}; + +TEST(ProtoMaker, DuplicatedAttr) { + paddle::framework::proto::OpProto op_proto; + paddle::framework::OpAttrChecker op_checker; + auto proto_maker = TestAttrProtoMaker(&op_proto, &op_checker); + ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet); +} + +class TestInOutProtoMaker : public paddle::framework::OpProtoAndCheckerMaker { + public: + TestInOutProtoMaker(paddle::framework::proto::OpProto* proto, + paddle::framework::OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("input", "input of test op"); + AddInput("input", "input of test op"); + } +}; + +TEST(ProtoMaker, DuplicatedInOut) { + paddle::framework::proto::OpProto op_proto; + paddle::framework::OpAttrChecker op_checker; + auto proto_maker = TestInOutProtoMaker(&op_proto, &op_checker); + ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet); +} diff --git a/paddle/fluid/framework/op_registry.cc b/paddle/fluid/framework/op_registry.cc new file mode 100644 index 0000000000000000000000000000000000000000..739ec72ebc17e31ab207b0e2260d7f563ceaca6e --- /dev/null +++ b/paddle/fluid/framework/op_registry.cc @@ -0,0 +1,68 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" + +#include + +namespace paddle { +namespace framework { + +std::unique_ptr OpRegistry::CreateOp( + const std::string& type, const VariableNameMap& inputs, + const VariableNameMap& outputs, AttributeMap attrs) { + auto& info = OpInfoMap::Instance().Get(type); + if (info.Checker() != nullptr) { + info.Checker()->Check(attrs); + } + auto op = info.Creator()(type, inputs, outputs, attrs); + return std::unique_ptr(op); +} + +static VariableNameMap ConvertOpDescVarsToVarNameMap( + const google::protobuf::RepeatedPtrField& + op_desc_vars) { + VariableNameMap ret_val; + for (auto& var : op_desc_vars) { + auto& var_names = ret_val[var.parameter()]; + auto& var_names_in_proto = var.arguments(); + var_names.reserve(static_cast(var_names_in_proto.size())); + std::copy(var_names_in_proto.begin(), var_names_in_proto.end(), + std::back_inserter(var_names)); + } + return ret_val; +} + +std::unique_ptr OpRegistry::CreateOp( + const proto::OpDesc& op_desc) { + VLOG(1) << "CreateOp directly from OpDesc is deprecated. It should only be" + "used in unit tests. 
Use CreateOp(const OpDesc& op_desc) " + "instead."; + VariableNameMap inputs = ConvertOpDescVarsToVarNameMap(op_desc.inputs()); + VariableNameMap outputs = ConvertOpDescVarsToVarNameMap(op_desc.outputs()); + AttributeMap attrs; + for (auto& attr : op_desc.attrs()) { + attrs[attr.name()] = GetAttrValue(attr); + } + + return CreateOp(op_desc.type(), inputs, outputs, attrs); +} + +std::unique_ptr OpRegistry::CreateOp(const OpDesc& op_desc) { + return CreateOp(op_desc.Type(), op_desc.Inputs(), op_desc.Outputs(), + op_desc.GetAttrMap()); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h new file mode 100644 index 0000000000000000000000000000000000000000..73faa99668ad58ddb66de515eb4750883f58bcf5 --- /dev/null +++ b/paddle/fluid/framework/op_registry.h @@ -0,0 +1,246 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "glog/logging.h" // For VLOG() +#include "paddle/fluid/framework/attribute.h" +#include "paddle/fluid/framework/details/op_registry.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/grad_op_desc_maker.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/shape_inference.h" + +namespace paddle { +namespace framework { +class Registrar { + public: + // In our design, various kinds of classes, e.g., operators and kernels, + // have their corresponding registry and registrar. The action of + // registration is in the constructor of a global registrar variable, which + // are not used in the code that calls package framework, and would + // be removed from the generated binary file by the linker. To avoid such + // removal, we add Touch to all registrar classes and make USE_OP macros to + // call this method. So, as long as the callee code calls USE_OP, the global + // registrar variable won't be removed by the linker. 
+ void Touch() {} +}; + +template +struct OperatorRegistrar : public Registrar { + explicit OperatorRegistrar(const char* op_type) { + PADDLE_ENFORCE(!OpInfoMap::Instance().Has(op_type), + "'%s' is registered more than once.", op_type); + static_assert(sizeof...(ARGS) != 0, + "OperatorRegistrar should be invoked at least by OpClass"); + OpInfo info; + details::OperatorRegistrarRecursive<0, false, ARGS...>(op_type, &info); + OpInfoMap::Instance().Insert(op_type, info); + } +}; + +class OpRegistry { + public: + static std::unique_ptr CreateOp(const std::string& type, + const VariableNameMap& inputs, + const VariableNameMap& outputs, + AttributeMap attrs); + + static std::unique_ptr CreateOp(const proto::OpDesc& op_desc); + + static std::unique_ptr CreateOp(const OpDesc& op_desc); +}; + +template +struct OpKernelRegistrarFunctor; + +template +struct OpKernelRegistrarFunctor { + using KERNEL_TYPE = + typename std::tuple_element>::type; + + void operator()(const char* op_type, const char* library_type) const { + using T = typename KERNEL_TYPE::ELEMENT_TYPE; + OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType(), + DataLayout::kAnyLayout, StringToLibraryType(library_type)); + OperatorWithKernel::AllOpKernels()[op_type][key].reset(new KERNEL_TYPE); + + constexpr auto size = std::tuple_size>::value; + OpKernelRegistrarFunctor + func; + func(op_type, library_type); + } +}; + +template +struct OpKernelRegistrarFunctor { + void operator()(const char* op_type, const char* library_type) const {} +}; + +// User can register many kernel in one place. The data type could be different. +template +class OpKernelRegistrar : public Registrar { + public: + explicit OpKernelRegistrar(const char* op_type, const char* library_type) { + OpKernelRegistrarFunctor func; + func(op_type, library_type); + } +}; + +/** + * check if MACRO is used in GLOBAL NAMESPACE. + */ +#define STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg) \ + struct __test_global_namespace_##uniq_name##__ {}; \ + static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \ + __test_global_namespace_##uniq_name##__>::value, \ + msg) + +/* + The variadic arguments should be class types derived from one of the + following classes: + OpProtoAndCheckerMaker + GradOpDescMakerBase + VarTypeInference + InferShapeBase +*/ +#define REGISTER_OPERATOR(op_type, op_class, ...) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __reg_op__##op_type, \ + "REGISTER_OPERATOR must be called in global namespace"); \ + class _OpClass_##op_type##_ : public op_class { \ + public: \ + DEFINE_OP_CLONE_METHOD(_OpClass_##op_type##_); \ + DEFINE_OP_CONSTRUCTOR(_OpClass_##op_type##_, op_class); \ + }; \ + static ::paddle::framework::OperatorRegistrar<_OpClass_##op_type##_, \ + ##__VA_ARGS__> \ + __op_registrar_##op_type##__(#op_type); \ + int TouchOpRegistrar_##op_type() { \ + __op_registrar_##op_type##__.Touch(); \ + return 0; \ + } + +/** + * Macro to register Operator. When the input is duplicable, you should + * use REGISTER_OP_EX with drop_empty_grad=false instead. + */ +#define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, \ + grad_op_class) \ + REGISTER_OP_EX(op_type, op_class, op_maker_class, grad_op_type, \ + grad_op_class, true) + +// When an argument is duplicable, we need to use this version. +// Perhaps we can omit DropEmptyIG template parameter and +// only have one version of REGISTER_OP. 
+#define REGISTER_OP_EX(op_type, op_class, op_maker_class, grad_op_type, \ + grad_op_class, drop_empty_grad) \ + REGISTER_OPERATOR(grad_op_type, grad_op_class); \ + class _GradOpDescMaker_##grad_op_type##_ \ + : public ::paddle::framework::DefaultGradOpDescMaker { \ + using ::paddle::framework::DefaultGradOpDescMaker< \ + drop_empty_grad>::DefaultGradOpDescMaker; \ + \ + protected: \ + virtual std::string GradOpType() const { return #grad_op_type; } \ + }; \ + REGISTER_OPERATOR(op_type, op_class, _GradOpDescMaker_##grad_op_type##_, \ + op_maker_class); + +#define REGISTER_OP_WITH_KERNEL(op_type, ...) \ + REGISTER_OPERATOR(op_type, ::paddle::framework::OperatorWithKernel, \ + ##__VA_ARGS__) + +#define REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) \ + REGISTER_OPERATOR(op_type, op_class, op_maker_class) + +/** + * Macro to register OperatorKernel. + */ +#define REGISTER_OP_KERNEL(op_type, LIBRARY_TYPE, place_class, ...) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __reg_op_kernel_##op_type##_##LIBRARY_TYPE##__, \ + "REGISTER_OP_KERNEL must be called in global namespace"); \ + static ::paddle::framework::OpKernelRegistrar \ + __op_kernel_registrar_##op_type##_##LIBRARY_TYPE##__(#op_type, \ + #LIBRARY_TYPE); \ + int TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE() { \ + __op_kernel_registrar_##op_type##_##LIBRARY_TYPE##__.Touch(); \ + return 0; \ + } + +#define REGISTER_OP_CUDA_KERNEL(op_type, ...) \ + REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::CUDAPlace, __VA_ARGS__) + +#define REGISTER_OP_CPU_KERNEL(op_type, ...) \ + REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__) + +/** + * Macro to mark what Operator and Kernel + * we will use and tell the compiler to + * link them into target. + */ +#define USE_OP_ITSELF(op_type) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __use_op_itself_##op_type, \ + "USE_OP_ITSELF must be called in global namespace"); \ + extern int TouchOpRegistrar_##op_type(); \ + static int use_op_itself_##op_type##_ __attribute__((unused)) = \ + TouchOpRegistrar_##op_type() + +#define USE_OP_DEVICE_KERNEL(op_type, LIBRARY_TYPE) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __use_op_kernel_##op_type##_##LIBRARY_TYPE##__, \ + "USE_OP_DEVICE_KERNEL must be in global namespace"); \ + extern int TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE(); \ + static int use_op_kernel_##op_type##_##LIBRARY_TYPE##_ \ + __attribute__((unused)) = \ + TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE() + +// TODO(fengjiayi): The following macros +// seems ugly, do we have better method? + +#ifndef PADDLE_WITH_CUDA +#define USE_OP_KERNEL(op_type) USE_OP_DEVICE_KERNEL(op_type, CPU) +#else +#define USE_OP_KERNEL(op_type) \ + USE_OP_DEVICE_KERNEL(op_type, CPU); \ + USE_OP_DEVICE_KERNEL(op_type, CUDA) +#endif + +#define USE_NO_KERNEL_OP(op_type) USE_OP_ITSELF(op_type); + +#define USE_CPU_ONLY_OP(op_type) \ + USE_OP_ITSELF(op_type); \ + USE_OP_DEVICE_KERNEL(op_type, CPU); + +#define USE_CUDA_ONLY_OP(op_type) \ + USE_OP_ITSELF(op_type); \ + USE_OP_DEVICE_KERNEL(op_type, CUDA) + +#define USE_OP(op_type) \ + USE_OP_ITSELF(op_type); \ + USE_OP_KERNEL(op_type) + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_registry_test.cc b/paddle/fluid/framework/op_registry_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..bfbb2cfc2c57c705cf42c65825edcc6dea08cf41 --- /dev/null +++ b/paddle/fluid/framework/op_registry_test.cc @@ -0,0 +1,373 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +#include + +#include "paddle/fluid/framework/op_registry.h" + +namespace pd = paddle::framework; + +namespace paddle { +namespace framework { + +class CosineOp : public OperatorBase { + public: + using OperatorBase::OperatorBase; + void Run(const Scope& scope, const platform::Place& place) const override {} +}; + +class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { + public: + CosineOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("input", "input of cosine op"); + AddOutput("output", "output of cosine op"); + AddAttr("scale", "scale of cosine op") + .SetDefault(1.0) + .GreaterThan(0.0); + AddComment("This is cos op"); + } +}; + +class MyTestOp : public OperatorBase { + public: + using OperatorBase::OperatorBase; + void Run(const Scope& scope, const platform::Place& place) const override {} +}; + +class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { + public: + MyTestOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("input", "input of cosine op").AsDuplicable(); + AddOutput("output", "output of cosine op").AsIntermediate(); + auto my_checker = [](int i) { + PADDLE_ENFORCE(i % 2 == 0, "'test_attr' must be even!"); + }; + AddAttr("test_attr", "a simple test attribute") + .AddCustomChecker(my_checker); + AddComment("This is my_test op"); + } +}; +} // namespace framework +} // namespace paddle + +static void BuildVar(const std::string& param_name, + std::initializer_list arguments, + paddle::framework::proto::OpDesc::Var* var) { + var->set_parameter(param_name); + for (auto& arg_name : arguments) { + var->add_arguments(arg_name); + } +} +REGISTER_OP_WITHOUT_GRADIENT(cos_sim, paddle::framework::CosineOp, + paddle::framework::CosineOpProtoAndCheckerMaker); +REGISTER_OP_WITHOUT_GRADIENT(my_test_op, paddle::framework::MyTestOp, + paddle::framework::MyTestOpProtoAndCheckerMaker); + +TEST(OpRegistry, CreateOp) { + paddle::framework::proto::OpDesc op_desc; + op_desc.set_type("cos_sim"); + BuildVar("input", {"aa"}, op_desc.add_inputs()); + BuildVar("output", {"bb"}, op_desc.add_outputs()); + + float scale = 3.3; + auto attr = op_desc.mutable_attrs()->Add(); + attr->set_name("scale"); + attr->set_type(paddle::framework::proto::AttrType::FLOAT); + attr->set_f(scale); + + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + paddle::framework::Scope scope; + paddle::platform::CPUPlace cpu_place; + op->Run(scope, cpu_place); + float scale_get = op->Attr("scale"); + ASSERT_EQ(scale_get, scale); +} + +TEST(OpRegistry, IllegalAttr) { + paddle::framework::proto::OpDesc op_desc; + op_desc.set_type("cos_sim"); + BuildVar("input", {"aa"}, op_desc.add_inputs()); + BuildVar("output", {"bb"}, op_desc.add_outputs()); + + auto attr = op_desc.mutable_attrs()->Add(); + attr->set_name("scale"); + attr->set_type(paddle::framework::proto::AttrType::FLOAT); + attr->set_f(-2.0); + + bool 
caught = false; + try { + paddle::framework::OpRegistry::CreateOp(op_desc); + } catch (paddle::platform::EnforceNotMet err) { + caught = true; + std::string msg = "larger_than check fail"; + const char* err_msg = err.what(); + for (size_t i = 0; i < msg.length(); ++i) { + ASSERT_EQ(err_msg[i], msg[i]); + } + } + ASSERT_TRUE(caught); +} + +TEST(OpRegistry, DefaultValue) { + paddle::framework::proto::OpDesc op_desc; + op_desc.set_type("cos_sim"); + BuildVar("input", {"aa"}, op_desc.add_inputs()); + BuildVar("output", {"bb"}, op_desc.add_outputs()); + + ASSERT_TRUE(op_desc.IsInitialized()); + + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + paddle::framework::Scope scope; + paddle::platform::CPUPlace cpu_place; + op->Run(scope, cpu_place); + ASSERT_EQ(op->Attr("scale"), 1.0); +} + +TEST(OpRegistry, CustomChecker) { + paddle::framework::proto::OpDesc op_desc; + op_desc.set_type("my_test_op"); + BuildVar("input", {"ii"}, op_desc.add_inputs()); + BuildVar("output", {"oo"}, op_desc.add_outputs()); + + // attr 'test_attr' is not set + bool caught = false; + try { + paddle::framework::OpRegistry::CreateOp(op_desc); + } catch (paddle::platform::EnforceNotMet err) { + caught = true; + std::string msg = "Attribute 'test_attr' is required!"; + const char* err_msg = err.what(); + for (size_t i = 0; i < msg.length(); ++i) { + ASSERT_EQ(err_msg[i], msg[i]); + } + } + ASSERT_TRUE(caught); + + // set 'test_attr' set to an illegal value + auto attr = op_desc.mutable_attrs()->Add(); + attr->set_name("test_attr"); + attr->set_type(paddle::framework::proto::AttrType::INT); + attr->set_i(3); + caught = false; + try { + paddle::framework::OpRegistry::CreateOp(op_desc); + } catch (paddle::platform::EnforceNotMet err) { + caught = true; + std::string msg = "'test_attr' must be even!"; + const char* err_msg = err.what(); + for (size_t i = 0; i < msg.length(); ++i) { + ASSERT_EQ(err_msg[i], msg[i]); + } + } + ASSERT_TRUE(caught); + + // set 'test_attr' set to a legal value + op_desc.mutable_attrs()->Clear(); + attr = op_desc.mutable_attrs()->Add(); + attr->set_name("test_attr"); + attr->set_type(paddle::framework::proto::AttrType::INT); + attr->set_i(4); + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + paddle::platform::CPUPlace cpu_place; + paddle::framework::Scope scope; + op->Run(scope, cpu_place); + int test_attr = op->Attr("test_attr"); + ASSERT_EQ(test_attr, 4); +} + +class CosineOpComplete : public paddle::framework::CosineOp { + public: + DEFINE_OP_CONSTRUCTOR(CosineOpComplete, paddle::framework::CosineOp); + DEFINE_OP_CLONE_METHOD(CosineOpComplete); +}; + +TEST(OperatorRegistrar, Test) { + using namespace paddle::framework; + OperatorRegistrar reg("cos"); +} + +namespace paddle { +namespace framework { + +class OpKernelTestMaker : public OpProtoAndCheckerMaker { + public: + OpKernelTestMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddComment("NoGradOp, same input output. 
no Grad"); + } +}; + +class OpWithKernelTest : public OperatorWithKernel { + public: + using OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(InferShapeContext* ctx) const override {} + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(proto::DataType::FP32, ctx.device_context()); + } +}; + +template +class OpKernelTest : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const {} +}; + +} // namespace framework +} // namespace paddle + +REGISTER_OP_WITHOUT_GRADIENT(op_with_kernel, + paddle::framework::OpWithKernelTest, + paddle::framework::OpKernelTestMaker); +REGISTER_OP_CPU_KERNEL( + op_with_kernel, + paddle::framework::OpKernelTest); + +REGISTER_OP_CUDA_KERNEL(op_with_kernel, + paddle::framework::OpKernelTest< + paddle::platform::CUDADeviceContext, float>); + +TEST(OperatorRegistrar, CPU) { + paddle::framework::proto::OpDesc op_desc; + paddle::platform::CPUPlace cpu_place; + paddle::framework::Scope scope; + + op_desc.set_type("op_with_kernel"); + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + + op->Run(scope, cpu_place); +} + +TEST(OperatorRegistrar, CUDA) { + paddle::framework::proto::OpDesc op_desc; + paddle::platform::CUDAPlace cuda_place(0); + paddle::framework::Scope scope; + + op_desc.set_type("op_with_kernel"); + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + + op->Run(scope, cuda_place); +} + +static int op_test_value = 0; + +using paddle::platform::DeviceContext; +using paddle::platform::CPUDeviceContext; +using paddle::platform::CUDADeviceContext; + +namespace paddle { +namespace framework { + +class OpWithMultiKernelTest : public OperatorWithKernel { + public: + using OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(InferShapeContext* ctx) const override {} + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + proto::DataType::FP32, platform::CUDAPlace(0), DataLayout::kAnyLayout, + framework::LibraryType::kCUDNN); + } +}; + +template +class OpMultiKernelTest : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const; +}; + +template +class OpMultiKernelTest + : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const { + ++op_test_value; + } +}; + +template +class OpMultiKernelTest + : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const { + --op_test_value; + } +}; + +template +class OpMultiKernelTest2 : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const; +}; + +template +class OpMultiKernelTest2 + : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const { + op_test_value += 10; + } +}; + +template +class OpMultiKernelTest2 + : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const { + op_test_value -= 10; + } +}; + +} // namespace framework +} // namespace paddle + +REGISTER_OP_WITHOUT_GRADIENT(op_with_multi_kernel, + paddle::framework::OpWithMultiKernelTest, + paddle::framework::OpKernelTestMaker); +REGISTER_OP_KERNEL( + op_with_multi_kernel, CPU, paddle::platform::CPUPlace, + 
paddle::framework::OpMultiKernelTest); +REGISTER_OP_KERNEL( + op_with_multi_kernel, MKLDNN, paddle::platform::CPUPlace, + paddle::framework::OpMultiKernelTest2); +REGISTER_OP_KERNEL( + op_with_multi_kernel, CUDA, paddle::platform::CUDAPlace, + paddle::framework::OpMultiKernelTest); +REGISTER_OP_KERNEL( + op_with_multi_kernel, CUDNN, paddle::platform::CUDAPlace, + paddle::framework::OpMultiKernelTest2); + +TEST(OperatorRegistrar, OpWithMultiKernel) { + paddle::framework::proto::OpDesc op_desc; + paddle::platform::CUDAPlace cuda_place(0); + paddle::platform::CPUPlace cpu_place; + paddle::framework::Scope scope; + + op_desc.set_type("op_with_multi_kernel"); + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + + // TODO(qiao) add priority back + // use all available kernels + op->Run(scope, cuda_place); + EXPECT_EQ(op_test_value, -10); +} diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc new file mode 100644 index 0000000000000000000000000000000000000000..61529fe38b15fe2a4bfa0d64159994d6b62fb086 --- /dev/null +++ b/paddle/fluid/framework/operator.cc @@ -0,0 +1,601 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include +#include + +#include + +#include "paddle/fluid/framework/data_transform.h" +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/shape_inference.h" +#include "paddle/fluid/framework/var_type.h" + +DECLARE_bool(benchmark); + +namespace paddle { +namespace framework { + +std::vector> kKernelPriority = { + std::make_tuple(platform::CUDAPlace(0), LibraryType::kCUDNN), + std::make_tuple(platform::CUDAPlace(0), LibraryType::kPlain), + std::make_tuple(platform::CPUPlace(), LibraryType::kMKLDNN), + std::make_tuple(platform::CPUPlace(), LibraryType::kPlain), +}; + +static DDim GetDims(const Scope& scope, const std::string& name) { + Variable* var = scope.FindVar(name); + if (var == nullptr) { + return DDim({-1}); + } + + if (var->IsType()) { + return var->Get().dims(); + } else if (var->IsType()) { + return var->Get().GetCompleteDims(); + } else { + return DDim({-1}); + } +} + +static LoD GetLoD(const Scope& scope, const std::string& name) { + Variable* var = scope.FindVar(name); + auto default_lod = LoD({{}}); + + if (var == nullptr) { + return default_lod; + } + + if (var->IsType()) { + return var->Get().lod(); + } else { + return default_lod; + } +} + +std::string OperatorBase::Input(const std::string& name) const { + auto& ins = Inputs(name); + PADDLE_ENFORCE_LE(ins.size(), 1UL, + "Operator %s's input %s should contain only one variable.", + type_, name); + return ins.empty() ? 
kEmptyVarName : ins[0]; +} + +const std::vector& OperatorBase::Inputs( + const std::string& name) const { + auto it = inputs_.find(name); + PADDLE_ENFORCE(it != inputs_.end(), "Operator %s does not have the input %s.", + type_, name); + return it->second; +} + +std::string OperatorBase::Output(const std::string& name) const { + auto& outs = Outputs(name); + PADDLE_ENFORCE_LE(outs.size(), 1UL, + "Operator %s's output %s should contain only one variable.", + type_, name); + return outs.empty() ? kEmptyVarName : outs[0]; +} + +const std::vector& OperatorBase::Outputs( + const std::string& name) const { + auto it = outputs_.find(name); + PADDLE_ENFORCE(it != outputs_.end(), + "Operator %s does not have an output called %s.", type_, name); + return it->second; +} + +std::string OperatorBase::DebugStringEx(const Scope* scope) const { + std::stringstream ss; + ss << "Op(" << type_ << "), inputs:{"; + for (auto it = inputs_.begin(); it != inputs_.end();) { + auto& input = *it; + ss << input.first << "["; + for (size_t i = 0; i < input.second.size(); ++i) { + ss << input.second[i]; + if (scope) { + ss << "[" << GetDims(*scope, input.second[i]) << "]"; + ss << "(" << GetLoD(*scope, input.second[i]) << ")"; + } + if (i != input.second.size() - 1) { + ss << ", "; + } + } + ss << "]"; + ++it; + if (it != inputs_.end()) { + ss << ", "; + } + } + ss << "}, outputs:{"; + for (auto it = outputs_.begin(); it != outputs_.end();) { + auto& output = *it; + ss << output.first << "["; + for (size_t i = 0; i < output.second.size(); ++i) { + ss << output.second[i]; + if (scope) { + ss << "[" << GetDims(*scope, output.second[i]) << "]"; + ss << "(" << GetLoD(*scope, output.second[i]) << ")"; + } + if (i != output.second.size() - 1) { + ss << ", "; + } + } + ss << "]"; + ++it; + if (it != outputs_.end()) { + ss << ", "; + } + } + ss << "}."; + return ss.str(); +} + +void OperatorBase::Rename(const std::string& old_name, + const std::string& new_name) { + for (auto& input : inputs_) { + std::replace(input.second.begin(), input.second.end(), old_name, new_name); + } + for (auto& output : outputs_) { + std::replace(output.second.begin(), output.second.end(), old_name, + new_name); + } +} + +OperatorBase::OperatorBase(const std::string& type, + const VariableNameMap& inputs, + const VariableNameMap& outputs, + const AttributeMap& attrs) + : type_(type), inputs_(inputs), outputs_(outputs), attrs_(attrs) { + GenerateTemporaryNames(); + CheckAllInputOutputSet(); +} + +std::vector OperatorBase::InputVars() const { + std::vector ret_val; + for (auto& o : inputs_) { + ret_val.reserve(ret_val.size() + o.second.size()); + ret_val.insert(ret_val.end(), o.second.begin(), o.second.end()); + } + return ret_val; +} + +std::vector OperatorBase::OutputVars(bool has_intermediate) const { + std::vector ret_val; + if (has_intermediate) { + // push all outputs into ret_val + for (auto& o : outputs_) { + ret_val.reserve(ret_val.size() + o.second.size()); + ret_val.insert(ret_val.end(), o.second.begin(), o.second.end()); + } + return ret_val; + } + auto& info = OpInfoMap::Instance().Get(Type()); + + // get all OpProto::Var for outputs + for (auto& o : info.Proto().outputs()) { + // ignore all intermediate output + if (o.intermediate()) continue; + auto out = outputs_.find(o.name()); + if (out != outputs_.end()) { + ret_val.reserve(ret_val.size() + out->second.size()); + ret_val.insert(ret_val.end(), out->second.begin(), out->second.end()); + } + } + return ret_val; +} + +void OperatorBase::CheckAllInputOutputSet() const { + auto& info_map = 
OpInfoMap::Instance(); + auto* op_info = info_map.GetNullable(Type()); + if (op_info == nullptr || op_info->proto_ == nullptr) return; + + for (auto& in : op_info->Proto().inputs()) { + PADDLE_ENFORCE(inputs_.find(in.name()) != inputs_.end(), + "Type %s's input %s is not set", Type(), in.name()); + } + + for (auto& out : op_info->Proto().outputs()) { + PADDLE_ENFORCE(outputs_.find(out.name()) != outputs_.end(), + "Type %s's output %s is not set", Type(), out.name()); + } +} + +void OperatorBase::GenerateTemporaryNames() { + static std::atomic gUniqId(0UL); + for (auto& output : outputs_) { + for (auto& output_name : output.second) { + if (output_name == kTempVarName) { + output_name += type_; + output_name += "@"; + output_name += std::to_string(gUniqId.fetch_add(1)); + } + } + } +} + +static bool VarIsTensor(const Variable* var) { + return var->IsType() || var->IsType(); +} + +static const Tensor* GetTensorFromVar(Variable* var) { + if (var->IsType()) { + return var->GetMutable(); + } else if (var->IsType()) { + return var->GetMutable()->mutable_value(); + } else { + PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", + var->Type().name()); + } +} + +static Tensor* GetMutableTensorFromVar(Variable* var) { + if (var->IsType()) { + return var->GetMutable(); + } else if (var->IsType()) { + return var->GetMutable()->mutable_value(); + } else { + PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", + var->Type().name()); + } +} + +template <> +const Tensor* ExecutionContext::Input(const std::string& name) const { + auto* var = InputVar(name); + return var == nullptr ? nullptr + : GetTensorFromVar(const_cast(var)); +} + +template <> +const std::vector ExecutionContext::MultiInput( + const std::string& name) const { + auto names = op().Inputs(name); + std::vector res; + res.reserve(names.size()); + std::transform(names.begin(), names.end(), std::back_inserter(res), + [&](const std::string& sub_name) { + auto var = scope_.FindVar(sub_name); + return var == nullptr ? nullptr : GetTensorFromVar(var); + }); + return res; +} + +template <> +Tensor* ExecutionContext::Output(const std::string& name) const { + auto var = OutputVar(name); + return var == nullptr ? nullptr : GetMutableTensorFromVar(var); +} + +template <> +std::vector ExecutionContext::MultiOutput( + const std::string& name) const { + auto names = op().Outputs(name); + std::vector res; + res.reserve(names.size()); + std::transform(names.begin(), names.end(), std::back_inserter(res), + [&](const std::string& sub_name) { + auto var = scope_.FindVar(sub_name); + return var == nullptr ? nullptr + : GetMutableTensorFromVar(var); + }); + return res; +} + +bool OpSupportGPU(const std::string& op_type) { + auto& all_kernels = OperatorWithKernel::AllOpKernels(); + auto it = all_kernels.find(op_type); + if (it == all_kernels.end()) { + // All control operator must support GPU + + return true; + } + for (auto& kern_pair : it->second) { + if (platform::is_gpu_place(kern_pair.first.place_)) { + return true; + } + } + return false; +} + +class RuntimeInferShapeContext : public InferShapeContext { + public: + RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope) + : op_(op), scope_(scope) {} + + bool HasInput(const std::string& name) const override { + auto& ins = Inputs(name); + size_t length = ins.size(); + if (length == 0) { + return false; + } + PADDLE_ENFORCE_EQ(length, 1UL, + "Input %s should not have more than one inputs", name); + auto ipt = ins[0]; + auto* var = ipt == kEmptyVarName ? 
nullptr : scope_.FindVar(ipt); + return var != nullptr; + } + + bool HasOutput(const std::string& name) const override { + auto& outs = Outputs(name); + size_t length = outs.size(); + if (length == 0) { + return false; + } + PADDLE_ENFORCE_EQ(length, 1UL, + "Output %s should not have more than one inputs", name); + auto ipt = outs[0]; + auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt); + return var != nullptr; + } + + bool HasInputs(const std::string& name) const override { + auto inputs = op_.Inputs(name); + if (inputs.empty()) { + return false; + } + for (auto& input : inputs) { + if (scope_.FindVar(input) == nullptr) { + return false; + } + } + return true; + } + + bool HasOutputs(const std::string& name) const override { + auto outputs = op_.Outputs(name); + if (outputs.empty()) { + return false; + } + for (auto& output : outputs) { + if (scope_.FindVar(output) == nullptr) { + return false; + } + } + return true; + } + + AttrReader Attrs() const override { return AttrReader(op_.Attrs()); } + + const std::vector& Inputs( + const std::string& name) const override { + return op_.Inputs(name); + } + + const std::vector& Outputs( + const std::string& name) const override { + return op_.Outputs(name); + } + + void ShareLoD(const std::string& in, const std::string& out, size_t i = 0, + size_t j = 0) const override { + PADDLE_ENFORCE_LT(i, Inputs(in).size()); + PADDLE_ENFORCE_LT(j, Outputs(out).size()); + Variable* in_var = scope_.FindVar(Inputs(in)[i]); + Variable* out_var = scope_.FindVar(Outputs(out)[j]); + if (!in_var->IsType()) return; + PADDLE_ENFORCE(out_var->IsType(), + "The %d-th output of Output(%s) must be LoDTensor.", j, out); + auto in_tensor = in_var->Get(); + auto* out_tensor = out_var->GetMutable(); + out_tensor->set_lod(in_tensor.lod()); + + // TODO(dzhwinter) : reuse ShareLoD in most operators. + // Need to call ShareLayout explicitly in sequence related ops. + // Shall we have a better method to shared info between in/out Tensor? 
+ out_tensor->set_layout(in_tensor.layout()); + } + + void ShareLayout(const std::string& in, const std::string& out, size_t i = 0, + size_t j = 0) const { + PADDLE_ENFORCE_LT(i, Inputs(in).size()); + PADDLE_ENFORCE_LT(j, Outputs(out).size()); + Variable* in_var = scope_.FindVar(Inputs(in)[i]); + Variable* out_var = scope_.FindVar(Outputs(out)[j]); + if (!in_var->IsType()) return; + PADDLE_ENFORCE(out_var->IsType(), + "The %d-th output of Output(%s) must be LoDTensor.", j, out); + auto in_tensor = in_var->Get(); + auto* out_tensor = out_var->GetMutable(); + out_tensor->set_layout(in_tensor.layout()); + } + + bool IsRuntime() const override { return true; } + + protected: + DDim GetDim(const std::string& name) const override { + Variable* var = scope_.FindVar(name); + if (var->IsType()) { + return var->Get().dims(); + } else if (var->IsType()) { + return var->Get().GetCompleteDims(); + } else { + PADDLE_THROW( + "Only LoDTensor/SelectedRows support 'GetDim', but Variable %s's " + "type_id is %s.", + name, var->Type().name()); + } + } + + std::vector GetRepeatedDims(const std::string& name) const override { + Variable* var = scope_.FindVar(name); + if (var->IsType()) { + return var->Get().shapes(); + } else { + PADDLE_THROW( + "Only ReaderHolder support 'GetRepeatedDims', but Variable %s's " + "type_id is %s.", + name, var->Type().name()); + } + } + + void SetDim(const std::string& name, const DDim& dim) override { + Variable* var = scope_.FindVar(name); + if (var->IsType()) { + var->GetMutable()->Resize(dim); + } else if (var->IsType()) { + var->GetMutable()->set_height(dim[0]); + } else { + PADDLE_THROW("Variable %s type_id %s, expect LoDTensor/SelectedRows.", + name, var->Type().name()); + } + } + + void SetRepeatedDims(const std::string& name, + const std::vector& dims) override { + Variable* var = scope_.FindVar(name); + if (var->IsType()) { + var->GetMutable()->set_shapes(dims); + } else { + PADDLE_THROW( + "Only ReaderHolder support 'SetRepeatedDims', but Variable %s's " + "type_id is %s.", + name, var->Type().name()); + } + } + + proto::VarDesc::VarType GetVarType(const std::string& name) const override { + auto* var = scope_.FindVar(name); + return ToVarType(var->Type()); + } + + InferShapeVarPtr GetVarPtr(const std::string& name) override { + return scope_.FindVar(name); + } + + private: + const OperatorBase& op_; + const Scope& scope_; +}; + +void OperatorWithKernel::Run(const Scope& scope, + const platform::Place& place) const { + RuntimeInferShapeContext infer_shape_ctx(*this, scope); + this->InferShape(&infer_shape_ctx); + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto dev_ctx = pool.Get(place); + + // check if op[type] has kernel registered. + auto& all_op_kernels = AllOpKernels(); + auto kernels_iter = all_op_kernels.find(type_); + if (kernels_iter == all_op_kernels.end()) { + PADDLE_THROW( + "There are no kernels which are registered in the %s operator.", type_); + } + + ExecutionContext ctx(*this, scope, *dev_ctx); + + OpKernelMap& kernels = kernels_iter->second; + + // TODO(dzhwinter) : kernel fallback mechanism will be added when all the + // transform functions are ready. 
+ + // for (auto& candidate : kKernelPriority) { + // Do selection + // } + + auto expected_kernel_key = this->GetExpectedKernelType(ctx); + VLOG(3) << "expected_kernel_key:" << expected_kernel_key; + + auto kernel_iter = kernels.find(expected_kernel_key); + if (kernel_iter == kernels.end()) { + PADDLE_THROW("op %s does not have kernel for %s", type_, + KernelTypeToString(expected_kernel_key)); + } + + // do data transform + Scope& new_scope = scope.NewScope(); + + for (auto& var_name_item : this->Inputs()) { + for (auto& var_name : var_name_item.second) { + auto* var = scope.FindVar(var_name); + if (var && VarIsTensor(var)) { + auto* tensor_in = GetTensorFromVar(var); + if (tensor_in->IsInitialized()) { + auto kernel_type_for_var = this->GetKernelTypeForVar( + var_name_item.first, *tensor_in, expected_kernel_key); + if (TransFromNeeded(kernel_type_for_var, expected_kernel_key)) { + auto out_var_names = OutputVars(true); + if (std::find(out_var_names.begin(), out_var_names.end(), + var_name) != out_var_names.end()) { + PADDLE_THROW( + "var %s is both input and output, " + "does not support transform", + var_name); + } + VLOG(3) << "Transform Variable " << var_name << " from " + << kernel_type_for_var << " to " << expected_kernel_key; + auto* trans_var = new_scope.Var(var_name); + std::shared_ptr out(new Tensor); + DataTransform(expected_kernel_key, kernel_type_for_var, *tensor_in, + out.get()); + CopyVariableWithTensor(*var, *(out.get()), *trans_var); + } + } + } + } + } + + auto* new_dev_ctx = pool.Get(expected_kernel_key.place_); + kernel_iter->second->Compute( + ExecutionContext(*this, new_scope, *new_dev_ctx)); + + /*For profiling/benchmark only*/ + if (FLAGS_benchmark) { + new_dev_ctx->Wait(); + } +} + +proto::DataType OperatorWithKernel::IndicateDataType( + const ExecutionContext& ctx) const { + auto& scope = ctx.scope(); + int data_type = -1; + for (auto& input : this->inputs_) { + for (auto& ipt_name : input.second) { + auto* var = scope.FindVar(ipt_name); + if (var != nullptr) { + const Tensor* t = nullptr; + if (var->IsType()) { + t = &var->Get(); + } else if (var->IsType()) { + t = &var->Get(); + } else if (var->IsType()) { + t = &(var->Get().value()); + } + if (t != nullptr) { + int tmp = static_cast(ToDataType(t->type())); + PADDLE_ENFORCE(tmp == data_type || data_type == -1, + "DataType of Paddle Op %s must be the same.", Type()); + data_type = tmp; + } + } + } + } + PADDLE_ENFORCE(data_type != -1, "DataType should be indicated by input"); + return static_cast(data_type); +} + +OpKernelType OperatorWithKernel::GetExpectedKernelType( + const ExecutionContext& ctx) const { + return OpKernelType(IndicateDataType(ctx), ctx.GetPlace()); +} + +OpKernelType OperatorWithKernel::GetKernelTypeForVar( + const std::string& var_name, const Tensor& tensor, + const OpKernelType& expected_kernel_type) const { + return OpKernelType(expected_kernel_type.data_type_, tensor.place()); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h new file mode 100644 index 0000000000000000000000000000000000000000..52300abeb7df346d610d2363335dc9d3330ee39e --- /dev/null +++ b/paddle/fluid/framework/operator.h @@ -0,0 +1,401 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "glog/logging.h" // For VLOG +#include "paddle/fluid/framework/attribute.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_kernel_type.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/variant.h" +#include "paddle/utils/Error.h" + +namespace paddle { +namespace framework { + +/// If a variable is a empty variable, that name will be used. +constexpr char kEmptyVarName[] = "@EMPTY@"; + +/// If a variable is a temporary variable, that name will be set in Python, +/// but it will be convert to a unique name in scope after OpCreator. +constexpr char kTempVarName[] = "@TEMP@"; + +/// If a variable's name has a certain suffix, it means that the +/// variable is the gradient of another varibale. +/// e.g. Variable "x@GRAD" is the gradient of varibale "x". +constexpr char kGradVarSuffix[] = "@GRAD"; + +/// Variables with this suffix are supposed to be filled up with zeros. +constexpr char kZeroVarSuffix[] = "@ZERO"; + +// define some kernel priority +/* Define multiple kernel type fallback order*/ +extern std::vector> kKernelPriority; + +inline std::string GradVarName(const std::string& var_name) { + return var_name + kGradVarSuffix; +} + +class OperatorBase; +class ExecutionContext; + +/** + * OperatorBase has the basic element that Net will call to do computation. + * Only CreateOperator from OpRegistry will new Operator directly. User + * should always construct a proto message OpDesc and call + * OpRegistry::CreateOp(op_desc) to get an Operator instance. + */ +class OperatorBase { + public: + OperatorBase(const std::string& type, const VariableNameMap& inputs, + const VariableNameMap& outputs, const AttributeMap& attrs); + + virtual ~OperatorBase() {} + + template + inline const T& Attr(const std::string& name) const { + PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap", + name); + return boost::get(attrs_.at(name)); + } + + /// if scope is not null, also show dimensions of arguments + virtual std::string DebugStringEx(const Scope* scope) const; + + std::string DebugString() const { return DebugStringEx(nullptr); } + + /// Net will call this function to Run an op. + virtual void Run(const Scope& scope, const platform::Place& place) const = 0; + + // FIXME(typhoonzero): this is only used for recv_op to stop event_loop. + virtual void Stop() {} + + virtual bool IsNetOp() const { return false; } + + virtual bool SupportGPU() const { return false; } + + /// rename inputs outputs name + void Rename(const std::string& old_name, const std::string& new_name); + + const VariableNameMap& Inputs() const { return inputs_; } + const VariableNameMap& Outputs() const { return outputs_; } + + //! 
Get a input with argument's name described in `op_proto` + std::string Input(const std::string& name) const; + //! Get a input which has multiple variables. + const std::vector& Inputs(const std::string& name) const; + + std::vector InputVars() const; + + //! Get a output with argument's name described in `op_proto` + std::string Output(const std::string& name) const; + //! Get an output which has multiple variables. + //! TODO add a vector_view to prevent memory copy. + const std::vector& Outputs(const std::string& name) const; + + virtual std::vector OutputVars(bool has_intermediate) const; + + const std::string& Type() const { return type_; } + void SetType(const std::string& type) { type_ = type; } + const AttributeMap& Attrs() const { return attrs_; } + + // Return a new operator instance, which is as same as this. + // Use unique_ptr to prevent caller forget to delete this pointer. + virtual std::unique_ptr Clone() const = 0; + + protected: + std::string type_; + // NOTE: in case of OpGrad, inputs_ contains: + // I (Inputs) + // O (Outputs) + // OG (Output Gradients) + VariableNameMap inputs_; + + // NOTE: in case of OpGrad, outputs_ contains + // IG (Inputs Gradients) + VariableNameMap outputs_; + AttributeMap attrs_; + + private: + void GenerateTemporaryNames(); + void CheckAllInputOutputSet() const; +}; + +// Macro for define a clone method. +// If you are writing an kernel operator, `Clone` will be defined when you +// register it. i.e. `Clone` method is not needed to define by yourself. +#define DEFINE_OP_CLONE_METHOD(cls) \ + std::unique_ptr<::paddle::framework::OperatorBase> Clone() const final { \ + return std::unique_ptr<::paddle::framework::OperatorBase>(new cls(*this)); \ + } + +// Macro for define a default constructor for Operator. +// You can also use +// using PARENT_CLASS::PARENT_CLASS; +// to use parent's constructor. +#define DEFINE_OP_CONSTRUCTOR(cls, parent_cls) \ + cls(const std::string& type, \ + const ::paddle::framework::VariableNameMap& inputs, \ + const ::paddle::framework::VariableNameMap& outputs, \ + const paddle::framework::AttributeMap& attrs) \ + : parent_cls(type, inputs, outputs, attrs) {} + +class NOP : public OperatorBase { + public: + using OperatorBase::OperatorBase; + void Run(const Scope& scope, const platform::Place& place) const override {} + std::unique_ptr Clone() const override { + return std::unique_ptr(new NOP(*this)); + } +}; + +class ExecutionContext { + public: + ExecutionContext(const OperatorBase& op, const Scope& scope, + const platform::DeviceContext& device_context) + : op_(op), scope_(scope), device_context_(device_context) {} + + const OperatorBase& op() const { return op_; } + + const Scope& scope() const { return scope_; } + + template + inline const T& Attr(const std::string& name) const { + return op_.Attr(name); + } + + size_t InputSize(const std::string& name) const { + return op_.Inputs(name).size(); + } + + size_t OutputSize(const std::string& name) const { + return op_.Outputs(name).size(); + } + + const Variable* InputVar(const std::string& name) const { + auto ipt = op_.Input(name); + return ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt); + } + + Variable* OutputVar(const std::string& name) const { + auto opt = op_.Output(name); + return opt == kEmptyVarName ? 
nullptr : scope_.FindVar(opt); + } + + const std::vector MultiInputVar( + const std::string& name) const { + auto names = op_.Inputs(name); + std::vector res; + res.reserve(names.size()); + std::transform(names.begin(), names.end(), std::back_inserter(res), + [this](const std::string& name) { + return name == kEmptyVarName ? nullptr + : scope_.FindVar(name); + }); + return res; + } + + std::vector MultiOutputVar(const std::string& name) const { + auto names = op_.Outputs(name); + std::vector res; + res.reserve(names.size()); + std::transform(names.begin(), names.end(), std::back_inserter(res), + [this](const std::string& name) { + return name == kEmptyVarName ? nullptr + : scope_.FindVar(name); + }); + return res; + } + + template + const T* Input(const std::string& name) const { + auto* var = InputVar(name); + return var == nullptr ? nullptr : &var->Get(); + } + + template + T* Output(const std::string& name) const { + auto var = OutputVar(name); + return var == nullptr ? nullptr : var->GetMutable(); + } + + template + const std::vector MultiInput(const std::string& name) const { + auto names = op_.Inputs(name); + std::vector res; + res.reserve(names.size()); + std::transform(names.begin(), names.end(), std::back_inserter(res), + [&](const std::string& sub_name) { + auto var = scope_.FindVar(sub_name); + return var == nullptr ? nullptr : &var->Get(); + }); + return res; + } + + template + std::vector MultiOutput(const std::string& name) const { + auto names = op_.Outputs(name); + std::vector res; + res.reserve(names.size()); + std::transform(names.begin(), names.end(), std::back_inserter(res), + [&](const std::string& sub_name) { + auto var = scope_.FindVar(sub_name); + return var == nullptr ? nullptr : var->GetMutable(); + }); + return res; + } + + void ShareLoD(const std::string& in, const std::string& out, size_t i = 0, + size_t j = 0) const { + PADDLE_ENFORCE_LT(i, InputSize(in)); + PADDLE_ENFORCE_LT(j, OutputSize(out)); + auto* in_var = MultiInputVar(in)[i]; + auto* out_var = MultiOutputVar(out)[j]; + if (!in_var->IsType()) return; + PADDLE_ENFORCE(out_var->IsType(), + "The %d-th output of Output(%s) must be LoDTensor.", j, out); + auto in_tensor = in_var->Get(); + auto* out_tensor = out_var->GetMutable(); + out_tensor->set_lod(in_tensor.lod()); + } + + platform::Place GetPlace() const { return device_context_.GetPlace(); } + + template + const DeviceContextType& device_context() const { + return *reinterpret_cast(&device_context_); + } + + const platform::DeviceContext& device_context() const { + return device_context_; + } + +#ifdef PADDLE_WITH_CUDA + const inline platform::CUDADeviceContext& cuda_device_context() const { + PADDLE_ENFORCE(platform::is_gpu_place(device_context_.GetPlace())); + return *reinterpret_cast( + &device_context_); + } +#endif + + //! Get actual name vector for this input. + const std::vector& Inputs(const std::string& name) const { + return op_.Inputs(name); + } + + //! Get actual name vector for this output. 
+ const std::vector& Outputs(const std::string& name) const { + return op_.Outputs(name); + } + + private: + const OperatorBase& op_; + const Scope& scope_; + const platform::DeviceContext& device_context_; +}; + +template <> +const Tensor* ExecutionContext::Input(const std::string& name) const; + +template <> +const std::vector ExecutionContext::MultiInput( + const std::string& name) const; + +template <> +Tensor* ExecutionContext::Output(const std::string& name) const; + +template <> +std::vector ExecutionContext::MultiOutput( + const std::string& name) const; + +class OpKernelBase { + public: + /** + * ExecutionContext is the only parameter of Kernel Run function. + * Run will get input/output variables, state such as momentum and + * device resource such as CUDA stream, cublas handle, etc. from + * ExecutionContext. User should construct it before run the Operator. + */ + + virtual void Compute(const ExecutionContext& context) const = 0; + + virtual ~OpKernelBase() = default; +}; + +template +class OpKernel : public OpKernelBase { + public: + using ELEMENT_TYPE = T; +}; + +class OperatorWithKernel : public OperatorBase { + public: + using OpKernelMap = + std::unordered_map, + OpKernelType::Hash>; + + OperatorWithKernel(const std::string& type, const VariableNameMap& inputs, + const VariableNameMap& outputs, const AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void Run(const Scope& scope, const platform::Place& place) const final; + + static std::unordered_map& + AllOpKernels() { + static std::unordered_map g_all_op_kernels; + return g_all_op_kernels; + } + + bool SupportGPU() const override { + auto& op_kernels = OperatorWithKernel::AllOpKernels().at(type_); + return std::any_of(op_kernels.begin(), op_kernels.end(), + [](OpKernelMap::const_reference kern_pair) { + return platform::is_gpu_place(kern_pair.first.place_); + }); + } + + virtual void InferShape(InferShapeContext* ctx) const { + OpInfoMap::Instance().Get(Type()).infer_shape_(ctx); + } + + protected: + virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const; + virtual OpKernelType GetKernelTypeForVar( + const std::string& var_name, const Tensor& tensor, + const OpKernelType& expected_kernel_type) const; + + private: + // indicate kernel DataType by input data. Defaultly all input data must be + // same. + proto::DataType IndicateDataType(const ExecutionContext& ctx) const; +}; + +extern bool OpSupportGPU(const std::string& op_type); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/operator_test.cc b/paddle/fluid/framework/operator_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..b90f5538bb620275521cdc11bf47b4014b2a66e2 --- /dev/null +++ b/paddle/fluid/framework/operator_test.cc @@ -0,0 +1,273 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/init.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { + +static int op_run_num = 0; + +class OpWithoutKernelTest : public OperatorBase { + public: + OpWithoutKernelTest(const std::string& type, const VariableNameMap& inputs, + const VariableNameMap& outputs, const AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs), x(1) {} + void Run(const Scope& scope, const platform::Place& place) const override { + ++op_run_num; + ASSERT_EQ(static_cast(inputs_.size()), 1); + ASSERT_EQ(static_cast(outputs_.size()), 1); + ASSERT_EQ(scope.FindVar(inputs_.at("input")[0]), nullptr); + ASSERT_EQ(x, 1); + ASSERT_NE(scope.FindVar(outputs_.at("output")[0]), nullptr); + } + + public: + int x{0}; +}; + +class OpWithoutKernelCheckerMaker : public OpProtoAndCheckerMaker { + public: + OpWithoutKernelCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("input", "input of test op"); + AddOutput("output", "output of test op"); + AddAttr("scale", "scale of cosine op"); + AddComment("This is test op"); + } +}; + +} // namespace framework +} // namespace paddle + +static void BuildVar(const std::string& param_name, + std::initializer_list arguments, + paddle::framework::proto::OpDesc::Var* var) { + var->set_parameter(param_name); + for (auto& arg_name : arguments) { + *var->mutable_arguments()->Add() = arg_name; + } +} + +REGISTER_OP_WITHOUT_GRADIENT(test_operator, + paddle::framework::OpWithoutKernelTest, + paddle::framework::OpWithoutKernelCheckerMaker); + +TEST(OperatorBase, all) { + paddle::framework::InitDevices(); + paddle::framework::proto::OpDesc op_desc; + op_desc.set_type("test_operator"); + BuildVar("input", {"IN1"}, op_desc.add_inputs()); + BuildVar("output", {"OUT1"}, op_desc.add_outputs()); + + auto attr = op_desc.mutable_attrs()->Add(); + attr->set_name("scale"); + attr->set_type(paddle::framework::proto::AttrType::FLOAT); + attr->set_f(3.14); + + paddle::platform::CPUPlace cpu_place; + paddle::framework::Scope scope; + + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + scope.Var("OUT1"); + ASSERT_EQ(paddle::framework::op_run_num, 0); + op->Run(scope, cpu_place); + ASSERT_EQ(paddle::framework::op_run_num, 1); +} + +namespace paddle { +namespace framework { + +class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { + public: + OpKernelTestProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("x", "input of test op"); + AddOutput("y", "output of test op"); + AddAttr("scale", "scale of cosine op") + .SetDefault(1.0) + .GreaterThan(0.0); + AddComment("This is test op"); + } +}; + +static int cpu_kernel_run_num = 0; + +class OpWithKernelTest : public OperatorWithKernel { + public: + using OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override {} + OpKernelType GetExpectedKernelType( + const ExecutionContext& ctx) const override { + return OpKernelType(proto::DataType::FP32, ctx.GetPlace()); + } +}; + +template +class CPUKernelTest : public OpKernel { + public: + void Compute(const ExecutionContext& ctx) const { + std::cout << ctx.op().DebugString() << std::endl; + cpu_kernel_run_num++; + ASSERT_EQ(ctx.op().Input("x"), "IN1"); + ASSERT_EQ(ctx.op().Output("y"), "OUT1"); + } 
+}; + +class OpKernelTestMultiInputsProtoAndCheckerMaker + : public OpProtoAndCheckerMaker { + public: + OpKernelTestMultiInputsProtoAndCheckerMaker(OpProto* proto, + OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("xs", "inputs of test op").AsDuplicable(); + AddInput("k", "input of test op"); + AddOutput("ys", "outputs of test op").AsDuplicable(); + AddAttr("scale", "scale of cosine op") + .SetDefault(1.0) + .GreaterThan(0.0); + AddComment("This is test op"); + } +}; + +class CPUKernalMultiInputsTest : public OpKernel { + public: + void Compute(const ExecutionContext& ctx) const { + auto xs = ctx.op().Inputs("xs"); + ASSERT_EQ(xs.size(), 3UL); + ASSERT_EQ(xs[0], "x0"); + ASSERT_EQ(xs[1], "x1"); + ASSERT_EQ(xs[2], "x2"); + + auto inVar0 = ctx.MultiInputVar("xs"); + ASSERT_EQ(inVar0.size(), 3U); + + auto intVar1 = ctx.InputVar("k"); + ASSERT_NE(intVar1, nullptr); + + auto outVar0 = ctx.MultiOutputVar("ys"); + ASSERT_EQ(outVar0.size(), 2U); + + auto inTensor0 = ctx.MultiInput("xs"); + ASSERT_EQ(inTensor0.size(), 3U); + + auto intTensor1 = ctx.Input("k"); + ASSERT_NE(intTensor1, nullptr); + + auto outTensor0 = ctx.MultiOutput("ys"); + ASSERT_EQ(outTensor0.size(), 2U); + + auto k = ctx.op().Input("k"); + ASSERT_EQ(k, "k0"); + + auto ys = ctx.op().Outputs("ys"); + ASSERT_EQ(ys.size(), 2UL); + ASSERT_EQ(ys[0], "y0"); + ASSERT_EQ(ys[1], "y1"); + } +}; + +} // namespace framework +} // namespace paddle + +REGISTER_OP_WITHOUT_GRADIENT( + op_with_kernel, paddle::framework::OpWithKernelTest, + paddle::framework::OpKernelTestProtoAndCheckerMaker); +REGISTER_OP_CPU_KERNEL(op_with_kernel, + paddle::framework::CPUKernelTest); + +// test with single input +TEST(OpKernel, all) { + paddle::framework::InitDevices(); + paddle::framework::proto::OpDesc op_desc; + op_desc.set_type("op_with_kernel"); + BuildVar("x", {"IN1"}, op_desc.add_inputs()); + BuildVar("y", {"OUT1"}, op_desc.add_outputs()); + + auto attr = op_desc.mutable_attrs()->Add(); + attr->set_name("scale"); + attr->set_type(paddle::framework::proto::AttrType::FLOAT); + attr->set_f(3.14); + + paddle::platform::CPUPlace cpu_place; + paddle::framework::Scope scope; + + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 0); + op->Run(scope, cpu_place); + ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1); +} + +REGISTER_OP_WITHOUT_GRADIENT( + op_multi_inputs_with_kernel, paddle::framework::OpWithKernelTest, + paddle::framework::OpKernelTestMultiInputsProtoAndCheckerMaker); +REGISTER_OP_CPU_KERNEL(op_multi_inputs_with_kernel, + paddle::framework::CPUKernalMultiInputsTest); + +// test with multi inputs +TEST(OpKernel, multi_inputs) { + using namespace paddle::framework; + + paddle::framework::InitDevices(); + proto::OpDesc op_desc; + + op_desc.set_type("op_multi_inputs_with_kernel"); + BuildVar("xs", {"x0", "x1", "x2"}, op_desc.add_inputs()); + BuildVar("k", {"k0"}, op_desc.add_inputs()); + BuildVar("ys", {"y0", "y1"}, op_desc.add_outputs()); + + auto attr = op_desc.mutable_attrs()->Add(); + attr->set_name("scale"); + attr->set_type(paddle::framework::proto::AttrType::FLOAT); + attr->set_f(3.14); + + paddle::platform::CPUPlace cpu_place; + paddle::framework::Scope scope; + scope.Var("x0")->GetMutable(); + scope.Var("x1")->GetMutable(); + scope.Var("x2")->GetMutable(); + scope.Var("k0")->GetMutable(); + scope.Var("y0")->GetMutable(); + scope.Var("y1")->GetMutable(); + + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + op->Run(scope, cpu_place); 
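+  // Run dispatches to CPUKernalMultiInputsTest::Compute, which asserts that
+  // the xs/k/ys inputs and outputs are wired through as registered above.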
+} + +class OperatorClone : public paddle::framework::OperatorBase { + public: + DEFINE_OP_CLONE_METHOD(OperatorClone); + OperatorClone(const std::string& type, + const paddle::framework::VariableNameMap& inputs, + const paddle::framework::VariableNameMap& outputs, + const paddle::framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const paddle::framework::Scope& scope, + const paddle::platform::Place& place) const override {} +}; + +TEST(Operator, Clone) { + paddle::framework::InitDevices(); + OperatorClone a("ABC", paddle::framework::VariableNameMap{}, + paddle::framework::VariableNameMap{}, + paddle::framework::AttributeMap{}); + auto b = a.Clone(); + ASSERT_EQ(a.Type(), b->Type()); +} diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc new file mode 100644 index 0000000000000000000000000000000000000000..b3f2e97cd954bd55ab1a8c9def6938c877a79449 --- /dev/null +++ b/paddle/fluid/framework/program_desc.cc @@ -0,0 +1,110 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/feed_fetch_type.h" + +namespace paddle { +namespace framework { + +BlockDesc *ProgramDesc::AppendBlock(const BlockDesc &parent) { + auto *b = desc_.add_blocks(); + b->set_parent_idx(parent.ID()); + b->set_idx(desc_.blocks_size() - 1); + blocks_.emplace_back(new BlockDesc(this, b)); + return blocks_.back().get(); +} + +proto::ProgramDesc *ProgramDesc::Proto() { + for (auto &block : blocks_) { + block->Flush(); + } + return &desc_; +} + +ProgramDesc::ProgramDesc() { + auto *block = desc_.mutable_blocks()->Add(); + block->set_idx(kRootBlockIndex); + block->set_parent_idx(kNoneBlockIndex); + blocks_.emplace_back(new BlockDesc(this, block)); +} + +ProgramDesc::ProgramDesc(const ProgramDesc &o) { + desc_ = o.desc_; + for (int i = 0; i < desc_.blocks_size(); ++i) { + auto *block = desc_.mutable_blocks(i); + blocks_.emplace_back(new BlockDesc(*o.blocks_[i], block, this)); + } + for (auto &block : blocks_) { + for (auto *op : block->AllOps()) { + for (const auto &attr : op->Proto()->attrs()) { + if (attr.type() == proto::AttrType::BLOCK) { + size_t blk_idx = attr.block_idx(); + op->SetBlockAttr(attr.name(), *this->MutableBlock(blk_idx)); + } + } + } + } +} + +ProgramDesc::ProgramDesc(const proto::ProgramDesc &desc) { + desc_ = desc; + for (auto &block_desc : *desc_.mutable_blocks()) { + blocks_.emplace_back(new BlockDesc(this, &block_desc)); + } + for (auto &block : blocks_) { + for (auto *op : block->AllOps()) { + for (const auto &attr : op->Proto()->attrs()) { + if (attr.type() == proto::AttrType::BLOCK) { + size_t blk_idx = attr.block_idx(); + op->SetBlockAttr(attr.name(), *this->MutableBlock(blk_idx)); + } + } + } + } +} + +ProgramDesc::ProgramDesc(const std::string &binary_str) { + PADDLE_ENFORCE(desc_.ParseFromString(binary_str), + "Fail to parse program_desc from binary 
string."); + for (auto &block_desc : *desc_.mutable_blocks()) { + blocks_.emplace_back(new BlockDesc(this, &block_desc)); + } +} + +const std::vector ProgramDesc::GetFeedTargetNames() { + BlockDesc *global_block = blocks_[0].get(); + std::vector feed_target_names; + for (auto *op : global_block->AllOps()) { + if (op->Type() == kFeedOpType) { + feed_target_names.insert(feed_target_names.begin(), op->Output("Out")[0]); + } + } + return feed_target_names; +} + +const std::vector ProgramDesc::GetFetchTargetNames() { + BlockDesc *global_block = blocks_[0].get(); + std::vector fetch_target_names; + for (auto *op : global_block->AllOps()) { + if (op->Type() == kFetchOpType) { + fetch_target_names.push_back(op->Input("X")[0]); + } + } + return fetch_target_names; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/program_desc.h b/paddle/fluid/framework/program_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..937de6ba9270a275e5d4e020fe5f2e7f5ef63557 --- /dev/null +++ b/paddle/fluid/framework/program_desc.h @@ -0,0 +1,58 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/proto_desc.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace framework { + +class BlockDesc; + +class ProgramDesc { + public: + ProgramDesc(); + + explicit ProgramDesc(const proto::ProgramDesc &desc); + + ProgramDesc(const ProgramDesc &o); + + explicit ProgramDesc(const std::string &binary_str); + + BlockDesc *AppendBlock(const BlockDesc &parent); + + BlockDesc *MutableBlock(size_t idx) { return blocks_[idx].get(); } + + const BlockDesc &Block(size_t idx) const { return *blocks_[idx]; } + + size_t Size() const { return blocks_.size(); } + + proto::ProgramDesc *Proto(); + + const std::vector GetFeedTargetNames(); + const std::vector GetFetchTargetNames(); + + private: + proto::ProgramDesc desc_; + + std::vector> blocks_; +}; +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/program_desc_test.cc b/paddle/fluid/framework/program_desc_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..afd5c9dabfbb0dab2832300dedc378ef617d8e81 --- /dev/null +++ b/paddle/fluid/framework/program_desc_test.cc @@ -0,0 +1,145 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/program_desc.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/block_desc.h" + +namespace paddle { +namespace framework { +TEST(ProgramDesc, copy_ctor) { + ProgramDesc program; + auto* global_block = program.MutableBlock(0); + auto* x = global_block->Var("X"); + x->SetType(proto::VarDesc_VarType_LOD_TENSOR); + x->SetLoDLevel(0); + x->SetDataType(proto::FP32); + x->SetShape({1000, 784}); + + auto* y = global_block->Var("Y"); + y->SetType(proto::VarDesc_VarType_LOD_TENSOR); + y->SetLoDLevel(0); + y->SetDataType(proto::FP32); + y->SetShape({784, 100}); + + auto* op = global_block->AppendOp(); + op->SetType("mul"); + op->SetInput("X", {x->Name()}); + op->SetInput("Y", {y->Name()}); + + auto* out = global_block->Var("Out"); + out->SetType(proto::VarDesc_VarType_LOD_TENSOR); + op->SetOutput("Y", {out->Name()}); + + ProgramDesc program_copy(program); + + auto* global_block_copy = program_copy.MutableBlock(0); + ASSERT_NE(global_block, global_block_copy); + + auto assert_same_var = [&](const std::string& name, VarDesc* var_before) { + ASSERT_TRUE(global_block_copy->HasVar(name)); + auto* copy = global_block_copy->Var(name); + ASSERT_NE(copy, var_before); + ASSERT_EQ(copy->Name(), var_before->Name()); + ASSERT_EQ(copy->GetType(), var_before->GetType()); + ASSERT_EQ(copy->GetShape(), var_before->GetShape()); + ASSERT_EQ(copy->Proto()->SerializeAsString(), + var_before->Proto()->SerializeAsString()); + }; + + ASSERT_EQ(global_block->LocalVarNames(), global_block_copy->LocalVarNames()); + ASSERT_EQ(3UL, global_block_copy->LocalVarNames().size()); + assert_same_var("X", x); + assert_same_var("Y", y); + assert_same_var("Out", out); + + for (size_t i = 0; i < global_block->OpSize(); ++i) { + auto op_origin = global_block->Op(i); + auto op_copy = global_block->Op(i); + + ASSERT_EQ(op_origin->Type(), op_copy->Type()); + ASSERT_EQ(op_origin->Inputs(), op_copy->Inputs()); + ASSERT_EQ(op_origin->Outputs(), op_copy->Outputs()); + + ASSERT_EQ(op_copy->Proto()->SerializeAsString(), + op_origin->Proto()->SerializeAsString()); + } + + // Not check block's protostr are same it because the order of vars could be + // different and it is correct. 
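+
+  // A minimal sketch of what the copy guarantees (see ProgramDesc's copy
+  // constructor in program_desc.cc): BLOCK-typed attributes are re-pointed at
+  // the copied blocks. Assuming a hypothetical op that carries a "sub_block"
+  // attribute, something along these lines is expected to hold:
+  //   ProgramDesc copy(program);
+  //   // the attribute now refers to copy.MutableBlock(idx),
+  //   // not to program.MutableBlock(idx).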
+} + +TEST(ProgramDescBind, serialize_and_deserialize) { + ProgramDesc program_origin; + auto* global_block = program_origin.MutableBlock(0); + auto* x = global_block->Var("X"); + x->SetType(proto::VarDesc_VarType_LOD_TENSOR); + x->SetLoDLevel(0); + x->SetDataType(proto::FP32); + x->SetShape({1000, 784}); + + auto* y = global_block->Var("Y"); + y->SetType(proto::VarDesc_VarType_LOD_TENSOR); + y->SetLoDLevel(0); + y->SetDataType(proto::FP32); + y->SetShape({784, 100}); + + auto* op = global_block->AppendOp(); + op->SetType("mul"); + op->SetInput("X", {x->Name()}); + op->SetInput("Y", {y->Name()}); + + auto* out = global_block->Var("Out"); + out->SetType(proto::VarDesc_VarType_LOD_TENSOR); + op->SetOutput("Y", {out->Name()}); + + std::string binary_str; + program_origin.Proto()->SerializeToString(&binary_str); + + ProgramDesc program_restored(binary_str); + auto* global_block_restored = program_restored.MutableBlock(0); + ASSERT_NE(global_block, global_block_restored); + + auto assert_same_var = [&](const std::string& name, VarDesc* var_before) { + ASSERT_TRUE(global_block_restored->HasVar(name)); + auto* restored = global_block_restored->Var(name); + ASSERT_NE(restored, var_before); + ASSERT_EQ(restored->Name(), var_before->Name()); + ASSERT_EQ(restored->GetType(), var_before->GetType()); + ASSERT_EQ(restored->GetShape(), var_before->GetShape()); + ASSERT_EQ(restored->Proto()->SerializeAsString(), + var_before->Proto()->SerializeAsString()); + }; + + ASSERT_EQ(global_block->LocalVarNames(), + global_block_restored->LocalVarNames()); + ASSERT_EQ(3UL, global_block_restored->LocalVarNames().size()); + assert_same_var("X", x); + assert_same_var("Y", y); + assert_same_var("Out", out); + + for (size_t i = 0; i < global_block->OpSize(); ++i) { + auto op_origin = global_block->Op(i); + auto op_restored = global_block->Op(i); + + ASSERT_EQ(op_origin->Type(), op_restored->Type()); + ASSERT_EQ(op_origin->Inputs(), op_restored->Inputs()); + ASSERT_EQ(op_origin->Outputs(), op_restored->Outputs()); + + ASSERT_EQ(op_restored->Proto()->SerializeAsString(), + op_origin->Proto()->SerializeAsString()); + } +} +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/proto_desc.h b/paddle/fluid/framework/proto_desc.h similarity index 100% rename from paddle/framework/proto_desc.h rename to paddle/fluid/framework/proto_desc.h diff --git a/paddle/fluid/framework/prune.cc b/paddle/fluid/framework/prune.cc new file mode 100644 index 0000000000000000000000000000000000000000..79dbd3bcab4124d3aa765c8ede174c9fb3de689b --- /dev/null +++ b/paddle/fluid/framework/prune.cc @@ -0,0 +1,212 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/prune.h" + +#include +#include +#include +#include +#include + +#include + +namespace paddle { +namespace framework { + +const std::string kFeedOpType = "feed"; +const std::string kFetchOpType = "fetch"; +const std::string kDropOutOpType = "dropout"; +const std::string kBatchNormOpType = "batch_norm"; + +bool HasDependentVar(const proto::OpDesc& op_desc, + const std::set& dependent_vars) { + for (auto& var : op_desc.outputs()) { + for (auto& argu : var.arguments()) { + if (dependent_vars.count(argu) != 0) { + return true; + } + } + } + return false; +} + +bool IsTarget(const proto::OpDesc& op_desc) { + if (op_desc.has_is_target()) { + return op_desc.is_target(); + } + return false; +} + +int GetSubBlockIndex(const proto::OpDesc& op_desc) { + for (auto& attr : op_desc.attrs()) { + if (attr.type() == proto::AttrType::BLOCK) { + PADDLE_ENFORCE(attr.has_block_idx()); + return attr.block_idx(); + } + } + return -1; +} + +bool HasSubBlock(const proto::OpDesc& op_desc) { + return GetSubBlockIndex(op_desc) > 0; +} + +// block_id is the idx of the current block in the input desc +// parent_block_id is the idx of the parent of the current block +// in the output desc, -1 means the current block is global block +// dependent_vars is passed recursively from the parent block to +// the child block to help pruning +void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output, + int block_id, int parent_block_id, + std::set& dependent_vars) { + auto& block = input.blocks(block_id); + auto& ops = block.ops(); + + bool expect_feed = true; + for (auto& op_desc : ops) { + PADDLE_ENFORCE(op_desc.type() != kFeedOpType || expect_feed, + "All FeedOps are at the beginning of the ProgramDesc"); + expect_feed = (op_desc.type() == kFeedOpType); + } + + bool expect_fetch = true; + for (auto op_iter = ops.rbegin(); op_iter != ops.rend(); ++op_iter) { + auto& op_desc = *op_iter; + PADDLE_ENFORCE(op_desc.type() != kFetchOpType || expect_fetch, + "All FetchOps must at the end of the ProgramDesc"); + expect_fetch = (op_desc.type() == kFetchOpType); + } + + std::vector should_run; + for (auto op_iter = ops.rbegin(); op_iter != ops.rend(); ++op_iter) { + auto& op_desc = *op_iter; + if (IsTarget(op_desc) || HasDependentVar(op_desc, dependent_vars)) { + // insert its input to the dependency graph + for (auto& var : op_desc.inputs()) { + for (auto& argu : var.arguments()) { + dependent_vars.insert(argu); + } + } + should_run.push_back(true); + } else { + should_run.push_back(false); + } + } + + // since we are traversing the ProgramDesc in reverse order + // we reverse the should_run vector + std::reverse(should_run.begin(), should_run.end()); + + // copy the current block from input to output + auto* block_field = output->mutable_blocks(); + *block_field->Add() = input.blocks(block_id); + + int output_block_id = output->blocks_size() - 1; + auto* output_block = output->mutable_blocks(output_block_id); + output_block->set_idx(output_block_id); + output_block->set_parent_idx(parent_block_id); + + auto* op_field = output_block->mutable_ops(); + op_field->Clear(); + for (size_t i = 0; i < should_run.size(); ++i) { + if (should_run[i]) { + auto* op = op_field->Add(); + *op = input.blocks(block_id).ops(i); + if (HasSubBlock(*op)) { + // create sub_block_dependent_vars here to help prune the sub block + std::set sub_block_dependent_vars; + for (auto& var : op->inputs()) { + for (auto& argu : var.arguments()) { + sub_block_dependent_vars.insert(argu); + } + } + for (auto& var : 
op->outputs()) { + for (auto& argu : var.arguments()) { + sub_block_dependent_vars.insert(argu); + } + } + // GetSubBlockIndex(*op) is the idx of the sub_block in the input desc + // output_block_id is the idx of the current block in the output desc + prune_impl(input, output, GetSubBlockIndex(*op), output_block_id, + sub_block_dependent_vars); + } + } + } + + // remove the VarDescs in BlockDesc that are not referenced in + // the pruned OpDescs + std::unordered_map var_map; + auto* var_field = output->mutable_blocks(output_block_id)->mutable_vars(); + for (const auto& var : *var_field) { + var_map[var.name()] = var; + } + + std::set var_names; + for (const auto& op : *op_field) { + auto& input_field = op.inputs(); + for (auto& input_var : input_field) { + for (auto& arg : input_var.arguments()) { + if (var_map.count(arg) != 0) { + var_names.insert(arg); + } + } + } + auto& output_field = op.outputs(); + for (auto& output_var : output_field) { + for (auto& arg : output_var.arguments()) { + if (var_map.count(arg) != 0) { + var_names.insert(arg); + } + } + } + } + + var_field->Clear(); + for (const auto& name : var_names) { + *var_field->Add() = var_map[name]; + } +} + +// TODO(fengjiayi): Prune() could be inplaced to avoid unnecessary copies +void Prune(const proto::ProgramDesc& input, proto::ProgramDesc* output) { + std::set dependent_vars; + output->clear_blocks(); + prune_impl(input, output, 0, -1, dependent_vars); +} + +void inference_optimize_impl(const proto::ProgramDesc& input, + proto::ProgramDesc* output, int block_id) { + *output = input; + auto* op_field = output->mutable_blocks(block_id)->mutable_ops(); + for (auto& op_desc : *op_field) { + if (op_desc.type() == kDropOutOpType || + op_desc.type() == kBatchNormOpType) { + for (auto& attr : *op_desc.mutable_attrs()) { + if (attr.name() == "is_test") { + attr.set_b(true); + break; + } + } + } + } +} + +void InferenceOptimize(const proto::ProgramDesc& input, + proto::ProgramDesc* output) { + inference_optimize_impl(input, output, 0); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/prune.h b/paddle/fluid/framework/prune.h new file mode 100644 index 0000000000000000000000000000000000000000..601e66b67a77b615e43fe74e72935b1622e59965 --- /dev/null +++ b/paddle/fluid/framework/prune.h @@ -0,0 +1,29 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { + +void Prune(const proto::ProgramDesc& input, proto::ProgramDesc* output); + +void InferenceOptimize(const proto::ProgramDesc& input, + proto::ProgramDesc* output); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/prune_test.cc b/paddle/fluid/framework/prune_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..36b76f0763ec2bab861adf86b60093c5e3c4b9e2 --- /dev/null +++ b/paddle/fluid/framework/prune_test.cc @@ -0,0 +1,152 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/prune.h" + +#include "paddle/fluid/framework/attribute.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/net_op.h" + +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/program_desc.h" + +#include + +namespace f = paddle::framework; +namespace ops = paddle::operators; + +void AddOp(const std::string &type, const f::VariableNameMap &inputs, + const f::VariableNameMap &outputs, f::AttributeMap attrs, + paddle::framework::BlockDesc *block) { + // insert output + for (auto kv : outputs) { + for (auto v : kv.second) { + auto var = block->Var(v); + var->SetDataType(paddle::framework::proto::DataType::FP32); + } + } + + // insert op + auto op = block->AppendOp(); + op->SetType(type); + for (auto &kv : inputs) { + op->SetInput(kv.first, kv.second); + } + for (auto &kv : outputs) { + op->SetOutput(kv.first, kv.second); + } + op->SetAttrMap(attrs); +} + +TEST(Prune, one_operator) { + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + + AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, f::AttributeMap{}, + block); + + f::proto::ProgramDesc *pdesc = program.Proto(); + f::proto::ProgramDesc pruned; + + f::Prune(*pdesc, &pruned); + PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 0); + + pdesc->mutable_blocks(0)->mutable_ops(0)->set_is_target(true); + f::Prune(*pdesc, &pruned); + PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 1); +} + +TEST(Prune, forward) { + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + + AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"b"}}}, {{"output", {"c"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"c"}}}, {{"output", {"d"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"d"}}}, {{"output", {"e"}}}, f::AttributeMap{}, + block); + + f::proto::ProgramDesc *pdesc = program.Proto(); + + for (int i = 0; i < pdesc->blocks(0).ops_size(); ++i) { + f::proto::ProgramDesc pruned; + pdesc->mutable_blocks(0)->mutable_ops(i)->set_is_target(true); + f::Prune(*pdesc, &pruned); + PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), i + 1); + } +} + +TEST(Prune, 
multi_input_op) { + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + + AddOp("one_one", {{"input", {"a0"}}}, {{"output", {"b0"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"a1"}}}, {{"output", {"b1"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"a2"}}}, {{"output", {"b2"}}}, f::AttributeMap{}, + block); + AddOp("three_one", {{"input", {"b0", "b1", "b2"}}}, {{"output", {"c"}}}, + f::AttributeMap{}, block); + + f::proto::ProgramDesc *pdesc = program.Proto(); + pdesc->mutable_blocks(0)->mutable_ops(3)->set_is_target(true); + + f::proto::ProgramDesc pruned; + f::Prune(*pdesc, &pruned); + PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 4); +} + +TEST(Prune, multi_output_op) { + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + + AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}}, + f::AttributeMap{}, block); + AddOp("one_one", {{"input", {"b"}}}, {{"output", {"b1"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"c"}}}, {{"output", {"c1"}}}, f::AttributeMap{}, + block); + + f::proto::ProgramDesc *pdesc = program.Proto(); + pdesc->mutable_blocks(0)->mutable_ops(2)->set_is_target(true); + + f::proto::ProgramDesc pruned; + f::Prune(*pdesc, &pruned); + PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 2); +} + +TEST(Prune, multi_target) { + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + + AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}}, + f::AttributeMap{}, block); + AddOp("one_one", {{"input", {"b"}}}, {{"output", {"b1"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"c"}}}, {{"output", {"c1"}}}, f::AttributeMap{}, + block); + + f::proto::ProgramDesc *pdesc = program.Proto(); + pdesc->mutable_blocks(0)->mutable_ops(1)->set_is_target(true); + pdesc->mutable_blocks(0)->mutable_ops(2)->set_is_target(true); + + f::proto::ProgramDesc pruned; + f::Prune(*pdesc, &pruned); + PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 3); +} diff --git a/paddle/fluid/framework/reader.cc b/paddle/fluid/framework/reader.cc new file mode 100644 index 0000000000000000000000000000000000000000..1ef0c4821110a259fd20469e736b93f44a80f90a --- /dev/null +++ b/paddle/fluid/framework/reader.cc @@ -0,0 +1,116 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
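+
+// A minimal usage sketch of how these readers compose (an illustration only,
+// assuming float data and the classes declared in reader.h; ownership and the
+// exact template arguments may differ at the real call sites):
+//   std::vector<DDim> shapes = {make_ddim({4, 2})};
+//   auto* base     = new RandomDataGenerator<float>(shapes, 0.f, 1.f);
+//   auto* shuffled = new ShuffleReader(base, /*buffer_size=*/64);
+//   auto* batched  = new BatchReader(shuffled, /*batch_size=*/16);
+//   std::vector<LoDTensor> batch;
+//   batched->ReadNext(&batch);  // one shuffled batch, instances concatenated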
+ +#include "paddle/fluid/framework/reader.h" + +namespace paddle { +namespace framework { + +DDim ReaderBase::shape(size_t idx) const { + PADDLE_ENFORCE_LT( + idx, shapes_.size(), + "Cannot get the %d'th shape, 'shapes_' only has %d elements.", idx, + shapes_.size()); + return shapes_[idx]; +} + +void ShuffleReader::ReadNext(std::vector* out) { + if (iteration_pos_ >= buffer_.size()) { + // Reload buffer with new data + buffer_.clear(); + buffer_.reserve(buffer_size_); + for (int i = 0; i < buffer_size_; ++i) { + if (reader_->HasNext()) { + buffer_.push_back(std::vector()); + reader_->ReadNext(&buffer_.back()); + } else { + break; + } + } + // TODO(fengjiayi): 'std::random_shuffle' can be very slow. It needs to be + // optimize. + std::random_shuffle(buffer_.begin(), buffer_.end()); + iteration_pos_ = 0; + } + out->clear(); + if (!buffer_.empty()) { + std::swap(*out, buffer_[iteration_pos_++]); + } + // if buffer_ is empty, the 'out' will return as an empty vector. +} + +void BatchReader::ReadNext(std::vector* out) { + buffer_.clear(); + buffer_.reserve(batch_size_); + for (int i = 0; i < batch_size_; ++i) { + if (reader_->HasNext()) { + buffer_.push_back(std::vector()); + reader_->ReadNext(&buffer_.back()); + } else { + break; + } + } + // Concat instances + out->clear(); + if (buffer_.empty()) { + // if buffer_ is empty, the 'out' will return as an empty vector. + return; + } + int out_num = buffer_[0].size(); + out->reserve(out_num); + for (int j = 0; j < out_num; ++j) { + // Merge shape and check date type + std::type_index batch_type = buffer_[0][j].type(); + DDim batch_shape = buffer_[0][j].dims(); + for (size_t i = 1; i < buffer_.size(); ++i) { + std::type_index ins_type = buffer_[i][j].type(); + DDim ins_shape = buffer_[i][j].dims(); + PADDLE_ENFORCE_EQ(batch_type, ins_type); + PADDLE_ENFORCE_EQ(slice_ddim(batch_shape, 1, batch_shape.size()), + slice_ddim(ins_shape, 1, ins_shape.size())); + PADDLE_ENFORCE_GT(ins_shape[0], 0); + batch_shape[0] += ins_shape[0]; + } + + LoDTensor out_tensor; + out_tensor.Resize(batch_shape); + out_tensor.mutable_data(platform::CPUPlace(), batch_type); + int64_t dst_offset = 0; + + // Merge lod and data + LoD batch_lod; + for (size_t i = 0; i < buffer_.size(); ++i) { + DDim ins_shape = buffer_[i][j].dims(); + LoD ins_lod = buffer_[i][j].lod(); + if (i == 0) { + batch_lod = ins_lod; + } else { + PADDLE_ENFORCE_EQ(batch_lod.size(), ins_lod.size()); + for (size_t level_idx = 0; level_idx < batch_lod.size(); ++level_idx) { + auto& lod_level = batch_lod[level_idx]; + for (size_t k = 1; k < ins_lod[level_idx].size(); ++k) { + lod_level.push_back(ins_lod[level_idx][k] + lod_level.back()); + } + } + } + Tensor dst = out_tensor.Slice(dst_offset, dst_offset + ins_shape[0]); + Copy(buffer_[i][j], platform::CPUPlace(), &dst); + dst_offset += ins_shape[0]; + } + out_tensor.set_lod(batch_lod); + out->push_back(out_tensor); + } +} +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/reader.h b/paddle/fluid/framework/reader.h new file mode 100644 index 0000000000000000000000000000000000000000..4a5eba5fb733b3e9da2b245b4dda18725c9b0895 --- /dev/null +++ b/paddle/fluid/framework/reader.h @@ -0,0 +1,161 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/lod_tensor_array.h" + +namespace paddle { +namespace framework { + +class ReaderBase { + public: + explicit ReaderBase(const std::vector& shapes) : shapes_(shapes) { + PADDLE_ENFORCE(!shapes_.empty()); + } + virtual void ReadNext(std::vector* out) = 0; + virtual bool HasNext() const = 0; + + virtual void ReInit() = 0; + + DDim shape(size_t idx) const; + std::vector shapes() const { return shapes_; } + void set_shapes(const std::vector& shapes) { shapes_ = shapes; } + + virtual ~ReaderBase() {} + + protected: + std::vector shapes_; +}; + +class FileReader : public ReaderBase { + public: + explicit FileReader(const std::vector& shapes) : ReaderBase(shapes) {} +}; + +class DecoratedReader : public ReaderBase { + public: + explicit DecoratedReader(ReaderBase* reader) + : ReaderBase(reader->shapes()), reader_(reader) { + PADDLE_ENFORCE_NOT_NULL(reader_); + } + + bool HasNext() const override { return reader_->HasNext(); } + + void ReInit() override { reader_->ReInit(); } + + protected: + ReaderBase* reader_; +}; + +// file readers + +template +class RandomDataGenerator : public FileReader { + public: + RandomDataGenerator(const std::vector& shapes, float min, float max) + : FileReader(shapes), min_(min), max_(max) { + PADDLE_ENFORCE_LE( + min, max, "'min' shouldn't be greater than 'max'.(%f vs %f)", min, max); + unsigned int seed = std::random_device()(); + engine_.seed(seed); + dist_ = std::uniform_real_distribution(min_, max_); + } + + void ReadNext(std::vector* out) override { + out->clear(); + out->reserve(shapes_.size()); + for (const DDim& shape : shapes_) { + PADDLE_ENFORCE_GE( + shape.size(), 2, + "The rank of reader's output data should be 2 at least.(Now it's %d)", + shape.size()); + LoDTensor out_tensor; + out_tensor.Resize(shape); + T* data = out_tensor.mutable_data(platform::CPUPlace()); + int64_t numel = product(shape); + for (int64_t i = 0; i < numel; ++i) { + data[i] = dist_(engine_); + } + out->push_back(out_tensor); + } + } + + bool HasNext() const override { return true; } + + void ReInit() override { return; } + + private: + float min_; + float max_; + std::minstd_rand engine_; + std::uniform_real_distribution dist_; +}; + +// decorated readers + +class ShuffleReader : public DecoratedReader { + public: + ShuffleReader(ReaderBase* reader, int buffer_size) + : DecoratedReader(reader), buffer_size_(buffer_size), iteration_pos_(0) { + buffer_.reserve(buffer_size); + } + + void ReadNext(std::vector* out) override; + + private: + int buffer_size_; + std::vector> buffer_; + size_t iteration_pos_; +}; + +class BatchReader : public DecoratedReader { + public: + BatchReader(ReaderBase* reader, int batch_size) + : DecoratedReader(reader), batch_size_(batch_size) { + buffer_.reserve(batch_size_); + } + + void ReadNext(std::vector* out) override; + + private: + int batch_size_; + std::vector> buffer_; +}; + +// The ReaderHolder is used as readers' unified wrapper, +// making it easier to access different type readers in Variables. 
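+//
+// A minimal usage sketch (an assumption about the call sites, which live in
+// the reader-related operators rather than in this header):
+//   framework::Variable* var = scope.Var("reader");
+//   auto* holder = var->GetMutable<ReaderHolder>();
+//   holder->Reset(new ShuffleReader(underlying_reader, /*buffer_size=*/1024));
+//   std::vector<LoDTensor> instance;
+//   while (holder->HasNext()) holder->ReadNext(&instance);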
+class ReaderHolder { + public: + void Reset(ReaderBase* reader) { reader_.reset(reader); } + + ReaderBase* Get() const { return reader_.get(); } + + void ReadNext(std::vector* out) { reader_->ReadNext(out); } + bool HasNext() const { return reader_->HasNext(); } + void ReInit() { reader_->ReInit(); } + + DDim shape(size_t idx) const { return reader_->shape(idx); } + std::vector shapes() const { return reader_->shapes(); } + void set_shapes(const std::vector& shapes) { + reader_->set_shapes(shapes); + } + + private: + std::unique_ptr reader_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc new file mode 100644 index 0000000000000000000000000000000000000000..6006ed16bd4a9aece5772bad58dc75c8b0847206 --- /dev/null +++ b/paddle/fluid/framework/scope.cc @@ -0,0 +1,130 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/scope.h" + +#include // for unique_ptr +#include // for call_once +#include "glog/logging.h" +#include "paddle/fluid/framework/threadpool.h" +#include "paddle/string/printf.h" + +DEFINE_bool(benchmark, false, + "Doing memory benchmark. It will make deleting scope synchronized, " + "and add some memory usage logs." + "Default cuda is asynchronous device, set to True will" + "force op run in synchronous mode."); + +namespace paddle { +namespace framework { + +Scope::~Scope() { + DropKids(); + for (auto& kv : vars_) { + VLOG(3) << "Destroy variable " << kv.first; + delete kv.second; + } +} + +Scope& Scope::NewScope() const { + kids_.push_back(new Scope(this)); + return *kids_.back(); +} + +Variable* Scope::Var(const std::string& name) { + auto* v = FindVarLocally(name); + if (v != nullptr) return v; + v = new Variable(); + vars_[name] = v; + VLOG(3) << "Create variable " << name; + v->name_ = &(vars_.find(name)->first); + return v; +} + +Variable* Scope::Var(std::string* name) { + auto var_name = string::Sprintf("%p.%d", this, vars_.size()); + if (name != nullptr) { + *name = var_name; + } + return Var(var_name); +} + +Variable* Scope::FindVar(const std::string& name) const { + auto var = FindVarLocally(name); + if (var != nullptr) { + return var; + } + return (parent_ == nullptr) ? nullptr : parent_->FindVar(name); +} + +const Scope* Scope::FindScope(const Variable* var) const { + for (auto& kv : vars_) { + if (kv.second == var) { + return this; + } + } + return (parent_ == nullptr) ? 
nullptr : parent_->FindScope(var); +} +void Scope::DropKids() { + for (Scope* s : kids_) delete s; + kids_.clear(); +} + +std::vector Scope::LocalVarNames() const { + std::vector known_vars; + known_vars.reserve(this->vars_.size()); + for (auto& p : vars_) { + known_vars.emplace_back(p.first); + } + return known_vars; +} + +void Scope::DeleteScope(Scope* scope) { + auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); + PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope); + this->kids_.erase(it); + // When making memory benchmark on Fluid, we have to delete scope sync. + if (FLAGS_benchmark) { + delete scope; + } else { + Async([scope] { delete scope; }); + } +} + +void Scope::Rename(const std::string& origin_name, + const std::string& new_name) const { + auto origin_it = vars_.find(origin_name); + PADDLE_ENFORCE(origin_it != vars_.end(), + "Cannot find original variable with name %s", origin_name); + auto new_it = vars_.find(new_name); + PADDLE_ENFORCE(new_it == vars_.end(), + "The variable with name %s is already in the scope", new_name); + vars_[new_name] = origin_it->second; + vars_.erase(origin_it); +} + +std::string Scope::Rename(const std::string& origin_name) const { + auto var_name = string::Sprintf("%p.%d", this, vars_.size()); + Rename(origin_name, var_name); + return var_name; +} + +Variable* Scope::FindVarLocally(const std::string& name) const { + auto it = vars_.find(name); + if (it != vars_.end()) return it->second; + return nullptr; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h new file mode 100644 index 0000000000000000000000000000000000000000..2da9e0716e7c02dc8c5397e37746344dff8e429d --- /dev/null +++ b/paddle/fluid/framework/scope.h @@ -0,0 +1,91 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include + +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace framework { + +class Scope; + +/** + * @brief Scope that manage all variables. + * + * Scope is an association of a name to Variable. All variables belong to + * Scope. You need to specify a scope to run a Net, i.e., `net.Run(&scope)`. + * One net can run in different scopes and update different variable in the + * scope. + */ +class Scope { + public: + Scope() {} + ~Scope(); + + /// Create a sub-scope. Returns a reference other than a pointer so + /// to prevent from manual deletion. + /// Mark it to const because that new kid scope cannot change parent scope. + Scope& NewScope() const; + + /// Create a variable with given name if it doesn't exist. + Variable* Var(const std::string& name); + + /// Create a variable with a scope-unique name. + Variable* Var(std::string* name = nullptr); + + /// Find a variable in the scope or any of its ancestors. Returns + /// nullptr if cannot find. 
+ Variable* FindVar(const std::string& name) const; + + const Scope& parent() const { return *parent_; } + + /// Find the scope or an ancestor scope that contains the given variable. + const Scope* FindScope(const Variable* var) const; + + void DeleteScope(Scope* scope); + + /// Drop all kids scopes belonged to this scope. + void DropKids(); + + // enumerate all the variables current contains. + std::vector LocalVarNames() const; + + // Rename variable to a new name + void Rename(const std::string& origin_name, + const std::string& new_name) const; + + // Rename variable to a new name and return the new name + std::string Rename(const std::string& origin_name) const; + + Variable* FindVarLocally(const std::string& name) const; + + private: + // Call Scope::NewScope for a sub-scope. + explicit Scope(Scope const* parent) : parent_(parent) {} + + mutable std::unordered_map vars_; + mutable std::list kids_; + Scope const* parent_{nullptr}; + + DISABLE_COPY_AND_ASSIGN(Scope); +}; +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/scope_test.cc b/paddle/fluid/framework/scope_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..d64acb130cb29eda34cb01ef0533c42f1f03dcf8 --- /dev/null +++ b/paddle/fluid/framework/scope_test.cc @@ -0,0 +1,71 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/scope.h" +#include "glog/logging.h" +#include "gtest/gtest.h" + +using paddle::framework::Scope; +using paddle::framework::Variable; + +TEST(Scope, VarsShadowing) { + Scope s; + Scope& ss1 = s.NewScope(); + Scope& ss2 = s.NewScope(); + + Variable* v0 = s.Var("a"); + Variable* v1 = ss1.Var("a"); + + EXPECT_NE(v0, v1); + + EXPECT_EQ(v0, s.FindVar("a")); + EXPECT_EQ(v1, ss1.FindVar("a")); + EXPECT_EQ(v0, ss2.FindVar("a")); +} + +TEST(Scope, FindVar) { + Scope s; + Scope& ss = s.NewScope(); + + EXPECT_EQ(nullptr, s.FindVar("a")); + EXPECT_EQ(nullptr, ss.FindVar("a")); + + ss.Var("a"); + + EXPECT_EQ(nullptr, s.FindVar("a")); + EXPECT_NE(nullptr, ss.FindVar("a")); +} + +TEST(Scope, FindScope) { + Scope s; + Scope& ss = s.NewScope(); + Variable* v = s.Var("a"); + + EXPECT_EQ(&s, s.FindScope(v)); + EXPECT_EQ(&s, ss.FindScope(v)); +} + +TEST(Scope, GetAllNames) { + Scope s; + Variable* v = s.Var("a"); + EXPECT_EQ(&s, s.FindScope(v)); + + std::vector ans = s.LocalVarNames(); + std::string str; + for (auto& var : ans) { + str += var; + } + + EXPECT_STREQ("a", str.c_str()); +} diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc new file mode 100644 index 0000000000000000000000000000000000000000..f5d9e9a4951877e031ea6fdf529676fcb21e202f --- /dev/null +++ b/paddle/fluid/framework/selected_rows.cc @@ -0,0 +1,69 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/selected_rows.h" + +namespace paddle { +namespace framework { +void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows, + const platform::DeviceContext& dev_ctx) { + { // the 1st field, uint32_t version + constexpr uint32_t version = 0; + os.write(reinterpret_cast(&version), sizeof(version)); + } + { + // the 2st field, rows information + auto& rows = selected_rows.rows(); + uint64_t size = rows.size(); + os.write(reinterpret_cast(&size), sizeof(size)); + for (uint64_t i = 0; i < size; ++i) { + os.write(reinterpret_cast(&rows[i]), sizeof(rows[i])); + } + } + { + // the 3st field, the height of SelectedRows + int64_t height = selected_rows.height(); + os.write(reinterpret_cast(&height), sizeof(height)); + } + // the 4st field, Tensor data + SerializeToStream(os, selected_rows.value(), dev_ctx); +} + +void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows, + const platform::DeviceContext& dev_ctx) { + { + // the 1st field, unit32_t version for SelectedRows + uint32_t version; + is.read(reinterpret_cast(&version), sizeof(version)); + PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported"); + } + { + // the 2st field, rows information + uint64_t size; + is.read(reinterpret_cast(&size), sizeof(size)); + auto& rows = *selected_rows->mutable_rows(); + rows.resize(size); + for (uint64_t i = 0; i < size; ++i) { + is.read(reinterpret_cast(&rows[i]), sizeof(int64_t)); + } + } + { + // the 3st field, the height of the SelectedRows + int64_t height; + is.read(reinterpret_cast(&height), sizeof(int64_t)); + selected_rows->set_height(height); + } + // the 4st field, tensor which contains the data + DeserializeFromStream(is, selected_rows->mutable_value(), dev_ctx); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h new file mode 100644 index 0000000000000000000000000000000000000000..f1a263962b2efc1ca828dd2eeb45495334ac1047 --- /dev/null +++ b/paddle/fluid/framework/selected_rows.h @@ -0,0 +1,73 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace framework { + +class SelectedRows { + public: + SelectedRows(const std::vector& rows, const int64_t& height) + : rows_(rows), height_(height) { + value_.reset(new Tensor()); + } + + SelectedRows() { + height_ = 0; + value_.reset(new Tensor()); + } + + platform::Place place() const { return value_->place(); } + + const Tensor& value() const { return *value_; } + + Tensor* mutable_value() { return value_.get(); } + + int64_t height() const { return height_; } + + void set_height(int64_t height) { height_ = height; } + + const Vector& rows() const { return rows_; } + + Vector* mutable_rows() { return &rows_; } + + void set_rows(const Vector& rows) { rows_ = rows; } + + DDim GetCompleteDims() const { + std::vector dims = vectorize(value_->dims()); + dims[0] = height_; + return make_ddim(dims); + } + + private: + // Notice: rows can be duplicate. We can have {0, 4, 7, 0, 5, 7, 9} here. + // SelectedRows are simplely concated when adding together. Until a + // SelectedRows add a Tensor, will the duplicate rows be handled. + Vector rows_; + std::unique_ptr value_{nullptr}; + int64_t height_; +}; + +/* + * Serialize/Desiralize SelectedRows to std::ostream + * You can pass ofstream or ostringstream to serilize to file + * or to a in memory string. GPU tensor will be copied to CPU. + */ +void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows, + const platform::DeviceContext& dev_ctx); +void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows, + const platform::DeviceContext& dev_ctx); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/selected_rows_test.cc b/paddle/fluid/framework/selected_rows_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..d414f2a5934282b4d586e6a9f7f81e44afbc9305 --- /dev/null +++ b/paddle/fluid/framework/selected_rows_test.cc @@ -0,0 +1,63 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/selected_rows.h" +#include "gtest/gtest.h" + +namespace paddle { +namespace framework { + +class SelectedRowsTester : public ::testing::Test { + public: + virtual void SetUp() override { + std::vector rows{0, 4, 7}; + int64_t height = 10; + int64_t row_numel = 100; + selected_rows_.reset(new SelectedRows(rows, height)); + + Tensor* value = selected_rows_->mutable_value(); + value->mutable_data( + make_ddim({static_cast(rows.size()), row_numel}), place_); + } + + protected: + platform::CPUPlace place_; + std::unique_ptr selected_rows_{nullptr}; +}; + +TEST_F(SelectedRowsTester, height) { ASSERT_EQ(selected_rows_->height(), 10); } + +TEST_F(SelectedRowsTester, dims) { + ASSERT_EQ(selected_rows_->value().dims(), make_ddim({3, 100})); +} + +TEST_F(SelectedRowsTester, complete_dims) { + ASSERT_EQ(selected_rows_->GetCompleteDims(), make_ddim({10, 100})); +} + +TEST_F(SelectedRowsTester, SerializeAndDeseralize) { + SelectedRows dst_tensor; + platform::CPUDeviceContext cpu_ctx(place_); + std::ostringstream oss; + + SerializeToStream(oss, *selected_rows_, cpu_ctx); + + std::istringstream iss(oss.str()); + DeserializeFromStream(iss, &dst_tensor, cpu_ctx); + + ASSERT_EQ(selected_rows_->rows(), dst_tensor.rows()); + ASSERT_EQ(selected_rows_->height(), dst_tensor.height()); + ASSERT_EQ(selected_rows_->value().dims(), dst_tensor.value().dims()); + ASSERT_EQ(selected_rows_->GetCompleteDims(), dst_tensor.GetCompleteDims()); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/shape_inference.cc b/paddle/fluid/framework/shape_inference.cc new file mode 100644 index 0000000000000000000000000000000000000000..cfd2334f1af19023c607d364172a4176be10f622 --- /dev/null +++ b/paddle/fluid/framework/shape_inference.cc @@ -0,0 +1,140 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include "paddle/fluid/framework/shape_inference.h" +#include "grad_op_desc_maker.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { + +DDim InferShapeContext::GetInputDim(const std::string &name) const { + const std::vector &arg_names = Inputs(name); + PADDLE_ENFORCE_EQ(arg_names.size(), 1UL, + "Input(%s) should hold one element, but now it holds %d", + name, arg_names.size()); + return this->GetDim(arg_names[0]); +} + +std::vector InferShapeContext::GetInputsDim( + const std::string &name) const { + const std::vector &arg_names = Inputs(name); + return GetDims(arg_names); +} + +std::vector InferShapeContext::GetReaderDims( + const std::string &name) const { + const std::vector &arg_names = Inputs(name); + PADDLE_ENFORCE_EQ( + arg_names.size(), 1UL, + "Reader input '%s' should hold one element, but now it holds %d", name, + arg_names.size()); + return this->GetRepeatedDims(arg_names[0]); +} + +DDim InferShapeContext::GetInputsElementDim(const std::string &name, + int idx) const { + const std::vector &names = Inputs(name); + return this->GetDim(names[idx]); +} + +void InferShapeContext::SetOutputDim(const std::string &name, const DDim &dim) { + auto &arg_names = Outputs(name); + PADDLE_ENFORCE_EQ(arg_names.size(), 1UL, + "Output(%s) should hold one element, but now it holds %d", + name, arg_names.size()); + SetDim(arg_names[0], dim); +} + +void InferShapeContext::SetOutputsDim(const std::string &name, + const std::vector &dims) { + auto &names = Outputs(name); + SetDims(names, dims); +} + +void InferShapeContext::SetReaderDims(const std::string &name, + const std::vector &dims) { + const std::vector &arg_names = Outputs(name); + PADDLE_ENFORCE_EQ( + arg_names.size(), 1UL, + "Reader output '%s' should hold one element, but now it holds %d", name, + arg_names.size()); + return this->SetRepeatedDims(arg_names[0], dims); +} + +std::vector InferShapeContext::GetInputVarPtrs( + const std::string &name) { + const std::vector arg_names = Inputs(name); + std::vector res; + res.reserve(arg_names.size()); + std::transform( + arg_names.begin(), arg_names.end(), std::back_inserter(res), + [this](const std::string &name) { return this->GetVarPtr(name); }); + return res; +} + +std::vector InferShapeContext::GetOutputVarPtrs( + const std::string &name) { + const std::vector arg_names = Outputs(name); + std::vector res; + res.reserve(arg_names.size()); + std::transform( + arg_names.begin(), arg_names.end(), std::back_inserter(res), + [this](const std::string &name) { return this->GetVarPtr(name); }); + return res; +} + +std::vector InferShapeContext::GetDims( + const std::vector &names) const { + std::vector ret; + ret.reserve(names.size()); + std::transform( + names.begin(), names.end(), std::back_inserter(ret), + [this](const std::string &name) { return this->GetDim(name); }); + return ret; +} + +void InferShapeContext::SetDims(const std::vector &names, + const std::vector &dims) { + size_t length = names.size(); + PADDLE_ENFORCE_EQ(length, dims.size()); + for (size_t i = 0; i < length; ++i) { + if (names[i] == framework::kEmptyVarName) { + continue; + } + SetDim(names[i], dims[i]); + } +} + +std::vector InferShapeContext::GetInputsVarType( + const std::string &name) const { + return GetVarTypes(Inputs(name)); +} + +std::vector InferShapeContext::GetOutputsVarType( + const std::string &name) const { + return GetVarTypes(Outputs(name)); +} + +std::vector InferShapeContext::GetVarTypes( + const std::vector &names) const { + std::vector retv; + 
retv.resize(names.size()); + std::transform(names.begin(), names.end(), retv.begin(), + std::bind(std::mem_fn(&InferShapeContext::GetVarType), this, + std::placeholders::_1)); + return retv; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/shape_inference.h b/paddle/fluid/framework/shape_inference.h new file mode 100644 index 0000000000000000000000000000000000000000..c907523325c8472f902517deebec9bc02168713c --- /dev/null +++ b/paddle/fluid/framework/shape_inference.h @@ -0,0 +1,87 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/attribute.h" +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/var_desc.h" +#include "paddle/fluid/framework/variable.h" + +namespace paddle { +namespace framework { + +using InferShapeVarPtr = boost::variant; + +class InferShapeContext { + public: + virtual ~InferShapeContext() = default; + virtual bool HasInput(const std::string &name) const = 0; + virtual bool HasOutput(const std::string &name) const = 0; + + std::vector GetInputsVarType( + const std::string &name) const; + std::vector GetOutputsVarType( + const std::string &name) const; + + virtual bool HasInputs(const std::string &name) const = 0; + virtual bool HasOutputs(const std::string &name) const = 0; + + DDim GetInputDim(const std::string &name) const; + std::vector GetInputsDim(const std::string &name) const; + std::vector GetReaderDims(const std::string &name) const; + DDim GetInputsElementDim(const std::string &name, int idx) const; + + void SetOutputDim(const std::string &name, const DDim &dim); + void SetOutputsDim(const std::string &name, const std::vector &dims); + void SetReaderDims(const std::string &name, const std::vector &dims); + + virtual AttrReader Attrs() const = 0; + virtual const std::vector &Inputs( + const std::string &name) const = 0; + virtual const std::vector &Outputs( + const std::string &name) const = 0; + + virtual void ShareLoD(const std::string &in, const std::string &out, + size_t i = 0, size_t j = 0) const = 0; + + virtual bool IsRuntime() const = 0; + + std::vector GetInputVarPtrs(const std::string &name); + std::vector GetOutputVarPtrs(const std::string &name); + + // Note: In while op, we need this to be public + void SetDims(const std::vector &names, + const std::vector &dims); + + protected: + virtual DDim GetDim(const std::string &name) const = 0; + virtual void SetDim(const std::string &name, const DDim &dim) = 0; + virtual std::vector GetRepeatedDims(const std::string &name) const = 0; + virtual void SetRepeatedDims(const std::string &name, + const std::vector &dims) = 0; + + std::vector GetDims(const std::vector &names) const; + + std::vector GetVarTypes( + const std::vector &names) const; + + virtual proto::VarDesc::VarType GetVarType(const std::string &name) const = 0; + + virtual InferShapeVarPtr GetVarPtr(const std::string &name) = 0; +}; + +} // 
namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc new file mode 100644 index 0000000000000000000000000000000000000000..a56091d3c629c4cedc13c465c84a646dc02cd094 --- /dev/null +++ b/paddle/fluid/framework/tensor.cc @@ -0,0 +1,19 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace framework {} +} // namespace paddle diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h new file mode 100644 index 0000000000000000000000000000000000000000..44d2c7dae943a06eeab8ab1a1565f62b11de0af1 --- /dev/null +++ b/paddle/fluid/framework/tensor.h @@ -0,0 +1,227 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { + +namespace framework { + +class LoDTensor; + +class Tensor { + public: + template + friend struct EigenTensor; + + template + friend struct EigenMatrix; + + template + friend struct EigenVector; + + public: + Tensor() : offset_(0) {} + + /*! Constructor with place should only be used in pybind. */ + explicit Tensor(const platform::Place& place) : offset_(0) { + holder_->set_place(place); + } + + /*! Return a pointer to mutable memory block. */ + template + inline T* data(); + + /*! Return a pointer to constant memory block. */ + template + inline const T* data() const; + + inline bool IsInitialized() const; + + inline void switch_place(platform::Place new_place); + + /** + * @brief Return a pointer to mutable memory block. + * @note If not exist, then allocation. + */ + template + inline T* mutable_data(platform::Place place); + + inline void* mutable_data(platform::Place place, std::type_index type); + + inline void* mutable_data(platform::Place place); + + /** + * @brief Return a pointer to mutable memory block. + * + * @param[in] dims The dimensions of the memory block. + * @param[in] place The place of the memory block. + * + * @note If not exist, then allocation. + */ + template + inline T* mutable_data(DDim dims, platform::Place place); + + /*! 
Return the dimensions of the memory block. */ + inline const DDim& dims() const; + + /*! Return the numel of the memory block. */ + inline int64_t numel() const; + + /*! Resize the dimensions of the memory block. */ + inline Tensor& Resize(const DDim& dims); + + /*! The internal of two tensors share the same memory block. */ + inline Tensor& ShareDataWith(const Tensor& src); + + /** + * @brief Return a sub-tensor of the given tensor. + * + * @param[in] begin_idx The index of the start row(inclusive) to slice. + * The index number begins from 0. + * @param[in] end_idx The index of the end row(exclusive) to slice. + * The index number begins from 0. + */ + inline Tensor Slice(int begin_idx, int end_idx) const; + + platform::Place place() const { + PADDLE_ENFORCE_NOT_NULL( + holder_, "Tensor not initialized yet when Tensor::place() is called."); + return holder_->place(); + } + + std::type_index type() const { + PADDLE_ENFORCE_NOT_NULL( + holder_, "Tensor not initialized yet when Tensor::type() is called."); + return holder_->type(); + } + + // memory size returns the holding memory size in byte. + size_t memory_size() const; + + inline void check_memory_size() const; + + inline DataLayout layout() const { return layout_; } + + inline void set_layout(const DataLayout layout) { layout_ = layout; } + + private: + friend class LoDTensor; + + /** + * @note Placeholder hides type T, so it doesn't appear as a template + * parameter of Variable. + */ + struct Placeholder { + virtual ~Placeholder() = default; + virtual void* ptr() const = 0; + virtual size_t size() const = 0; + virtual std::type_index type() const = 0; + virtual platform::Place place() const = 0; + virtual void set_type(std::type_index type) = 0; + virtual void set_place(platform::Place place) = 0; + }; + + template + struct PlaceholderImpl : public Placeholder { + PlaceholderImpl(Place place, size_t size, std::type_index type) + : ptr_(static_cast(memory::Alloc(place, size)), + memory::PODDeleter(place)), + place_(place), + size_(size), + type_(type) { + PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s memory to allocation.", + (is_cpu_place(place_) ? "CPU" : "GPU")); + } + + virtual size_t size() const { return size_; } + virtual platform::Place place() const { return place_; } + virtual void* ptr() const { return static_cast(ptr_.get()); } + virtual std::type_index type() const { return type_; } + virtual void set_type(std::type_index type) { type_ = type; } + virtual void set_place(platform::Place place) { place_ = place; } + + /*! the pointer of memory block. */ + std::unique_ptr> ptr_; + + /*! the place of memory block. */ + platform::Place place_; + + /*! the size of memory block. */ + size_t size_; + + /* the current type of memory */ + std::type_index type_; + }; + + /*! holds the memory block if allocated. */ + std::shared_ptr holder_; + + /** + * @brief points to elements dimensions. + * + * @note dims_ do not indicate the memory block size. + */ + + DDim dims_; + + /** + * @brief the layout of memory block, default is NHWC. + * + * @note the memory allocation order, describe how weight/data is stored + * For example, in 4-D Tensor(rank=4), there are three commonly + * used layout. They are + * NCHW, NHWC, CHWN. + * N,C,H,W for respectively the batch size, the number of + * feature maps, the height. + */ + + DataLayout layout_ = DataLayout::kNHWC; + + /** + * @brief A PlaceHolder may be shared by more than one tensor. + * + * @note Some of them may be slices of the others. 
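A minimal usage sketch (illustration only, not part of this patch) of the Tensor interface declared above, assuming the usual element-type template parameter on mutable_data/data:

#include "paddle/fluid/framework/tensor.h"

void TensorBasicsSketch() {
  namespace fw = paddle::framework;
  fw::Tensor t;
  // Resize only records the shape; memory is allocated lazily by mutable_data.
  t.Resize(fw::make_ddim({2, 3}));
  float* buf = t.mutable_data<float>(paddle::platform::CPUPlace());
  for (int64_t i = 0; i < t.numel(); ++i) {
    buf[i] = static_cast<float>(i);
  }
  const float* ro = t.data<float>();  // read-only view of the same block
  (void)ro;
}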
So the offset_ + * is introduced here to indicate the byte offset between + * PlaceHolder::ptr_ and where the tensor data really begins. + */ + size_t offset_; +}; + +inline void Tensor::switch_place(platform::Place new_place) { + if (holder_->place() == new_place) { + return; + } + + // TODO(tonyyang-svail): do memcpy here. + PADDLE_THROW("Not Implemented"); +} + +} // namespace framework +} // namespace paddle + +#include "paddle/fluid/framework/tensor_impl.h" diff --git a/paddle/framework/tensor.md b/paddle/fluid/framework/tensor.md similarity index 100% rename from paddle/framework/tensor.md rename to paddle/fluid/framework/tensor.md diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..e69836292cd0f4ed99e87ee8e297021dac43b64f --- /dev/null +++ b/paddle/fluid/framework/tensor_impl.h @@ -0,0 +1,196 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { + +template +struct SizeOfTypeFunctor; + +template +struct SizeOfTypeFunctor { + size_t operator()(std::type_index type) const { + if (typeid(T).hash_code() == type.hash_code()) { + return sizeof(T); + } else { + return 0UL; + } + } +}; + +template <> +struct SizeOfTypeFunctor<> { + size_t operator()(std::type_index type) const { return 0UL; } +}; + +template +struct SizeOfTypeFunctor { + size_t operator()(std::type_index type) const { + SizeOfTypeFunctor head; + size_t head_size = head(type); + if (head_size != 0) { + return head_size; + } + SizeOfTypeFunctor tail; + return tail(type); + } +}; + +static inline size_t SizeOfType(std::type_index type) { + SizeOfTypeFunctor functor; + size_t size = functor(type); + PADDLE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name()); + return size; +} + +inline void Tensor::check_memory_size() const { + PADDLE_ENFORCE_NOT_NULL( + holder_, "Tensor holds no memory. Call Tensor::mutable_data first."); + PADDLE_ENFORCE_LE( + numel() * SizeOfType(type()), memory_size(), + "Tensor's dims_ is out of bound. Call Tensor::mutable_data " + "first to re-allocate memory.\n" + "or maybe the required data-type mismatches the data already stored."); +} + +inline size_t Tensor::memory_size() const { + return holder_ == nullptr ? 
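The SizeOfTypeFunctor recursion above maps a runtime std::type_index back to sizeof(T) over a compile-time type list. A self-contained sketch of the same pattern, with an assumed, shortened type list, purely for illustration:

#include <cstddef>
#include <cstdio>
#include <typeindex>
#include <typeinfo>

template <typename... T>
struct SizeOfSketch;

template <>
struct SizeOfSketch<> {
  size_t operator()(std::type_index) const { return 0; }  // no match
};

template <typename Head, typename... Tail>
struct SizeOfSketch<Head, Tail...> {
  size_t operator()(std::type_index t) const {
    if (std::type_index(typeid(Head)) == t) return sizeof(Head);
    return SizeOfSketch<Tail...>()(t);  // try the rest of the list
  }
};

int main() {
  SizeOfSketch<float, double, int> size_of;
  // Typically prints "4 8"; an unlisted type yields 0, which the real
  // SizeOfType turns into a PADDLE_ENFORCE failure.
  std::printf("%zu %zu\n", size_of(typeid(float)), size_of(typeid(double)));
  return 0;
}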
0UL : holder_->size() - offset_; +} + +template +inline const T* Tensor::data() const { + check_memory_size(); + PADDLE_ENFORCE(std::is_same::value || + holder_->type().hash_code() == typeid(T).hash_code(), + "Tensor holds the wrong type, it holds %s", + this->holder_->type().name()); + + return reinterpret_cast( + reinterpret_cast(holder_->ptr()) + offset_); +} + +inline bool Tensor::IsInitialized() const { return holder_ != nullptr; } + +template +inline T* Tensor::data() { + check_memory_size(); + PADDLE_ENFORCE(std::is_same::value || + holder_->type().hash_code() == typeid(T).hash_code(), + "Tensor holds the wrong type, it holds %s", + this->holder_->type().name()); + return reinterpret_cast(reinterpret_cast(holder_->ptr()) + + offset_); +} + +template +inline T* Tensor::mutable_data(DDim dims, platform::Place place) { + static_assert(std::is_pod::value, "T must be POD"); + Resize(dims); + return mutable_data(place); +} + +template +inline T* Tensor::mutable_data(platform::Place place) { + static_assert(std::is_pod::value, "T must be POD"); + return reinterpret_cast(mutable_data(place, typeid(T))); +} + +inline void* Tensor::mutable_data(platform::Place place, std::type_index type) { + if (holder_ != nullptr) { + holder_->set_type(type); + } + PADDLE_ENFORCE_GT( + numel(), 0, + "When calling this method, the Tensor's numel must be larger than zero. " + "Please check Tensor::Resize has been called first."); + int64_t size = numel() * SizeOfType(type); + /* some versions of boost::variant don't have operator!= */ + if (holder_ == nullptr || !(holder_->place() == place) || + holder_->size() < size + offset_) { + if (platform::is_cpu_place(place)) { + holder_.reset(new PlaceholderImpl( + boost::get(place), size, type)); + } else if (platform::is_gpu_place(place)) { +#ifndef PADDLE_WITH_CUDA + PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); + } +#else + holder_.reset(new PlaceholderImpl( + boost::get(place), size, type)); + } +#endif + offset_ = 0; + } + return reinterpret_cast(reinterpret_cast(holder_->ptr()) + + offset_); +} + +inline void* Tensor::mutable_data(platform::Place place) { + PADDLE_ENFORCE(this->holder_ != nullptr, + "Cannot invoke mutable data if current hold nothing"); + return mutable_data(place, holder_->type()); +} + +inline Tensor& Tensor::ShareDataWith(const Tensor& src) { + src.check_memory_size(); + *this = src; + return *this; +} + +inline Tensor Tensor::Slice(int begin_idx, int end_idx) const { + check_memory_size(); + PADDLE_ENFORCE_GE(begin_idx, 0, + "The start row index must be greater than 0."); + PADDLE_ENFORCE_LE(end_idx, dims_[0], "The end row index is out of bound."); + PADDLE_ENFORCE_LT( + begin_idx, end_idx, + "The start row index must be lesser than the end row index."); + + if (dims_[0] == 1) { + return *this; + } else { + size_t base = numel() / dims_[0]; + Tensor dst; + dst.holder_ = holder_; + dst.set_layout(layout_); + DDim dst_dims = dims_; + dst_dims[0] = end_idx - begin_idx; + dst.Resize(dst_dims); + dst.offset_ = offset_ + begin_idx * base * SizeOfType(type()); + return dst; + } +} + +inline Tensor& Tensor::Resize(const DDim& dims) { + dims_ = dims; + return *this; +} + +inline const DDim& Tensor::dims() const { return dims_; } + +inline int64_t Tensor::numel() const { return product(dims_); } + +inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) { + Tensor res; + res.ShareDataWith(src); + res.Resize(flatten_to_2d(src.dims(), num_col_dims)); + return res; +} + +} // namespace framework +} // namespace paddle 
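A short sketch (illustration only, not part of this patch) of how Slice and ReshapeToMatrix defined above behave; both reuse the source allocation instead of copying it.

#include "paddle/fluid/framework/tensor.h"

void SliceAndReshapeSketch() {
  namespace fw = paddle::framework;
  fw::Tensor t;
  t.mutable_data<float>(fw::make_ddim({4, 3, 2}), paddle::platform::CPUPlace());

  // Rows [1, 3) of the outermost dimension; shares memory with t via offset_.
  fw::Tensor rows = t.Slice(1, 3);             // dims: {2, 3, 2}

  // Fold the first num_col_dims axes into the row dimension of a matrix.
  fw::Tensor mat = fw::ReshapeToMatrix(t, 1);  // dims: {4, 6}
  (void)rows;
  (void)mat;
}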
diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..6ed416e46f99f4d2ed50538a3e2c090ed8dd6fc3 --- /dev/null +++ b/paddle/fluid/framework/tensor_test.cc @@ -0,0 +1,215 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/tensor.h" +#include +#include + +namespace framework = paddle::framework; +namespace platform = paddle::platform; + +TEST(Tensor, Dims) { + framework::Tensor tt; + tt.Resize({2, 3, 4}); + framework::DDim dims = tt.dims(); + ASSERT_EQ(arity(dims), 3); + for (int i = 0; i < 3; ++i) { + EXPECT_EQ(i + 2, dims[i]); + } +} + +TEST(Tensor, DataAssert) { + framework::Tensor src_tensor; + + bool caught = false; + try { + src_tensor.data(); + } catch (platform::EnforceNotMet err) { + caught = true; + std::string msg = + "holder_ should not be null\nTensor holds no memory. Call " + "Tensor::mutable_data first."; + const char* what = err.what(); + for (size_t i = 0; i < msg.length(); ++i) { + ASSERT_EQ(what[i], msg[i]); + } + } + ASSERT_TRUE(caught); +} + +TEST(Tensor, MutableData) { + { + framework::Tensor src_tensor; + float* p1 = nullptr; + float* p2 = nullptr; + // initialization + p1 = src_tensor.mutable_data(framework::make_ddim({1, 2, 3}), + platform::CPUPlace()); + EXPECT_NE(p1, nullptr); + // set src_tensor a new dim with large size + // momery is supposed to be re-allocated + p2 = src_tensor.mutable_data(framework::make_ddim({3, 4}), + platform::CPUPlace()); + EXPECT_NE(p2, nullptr); + EXPECT_NE(p1, p2); + // set src_tensor a new dim with same size + // momery block is supposed to be unchanged + p1 = src_tensor.mutable_data(framework::make_ddim({2, 2, 3}), + platform::CPUPlace()); + EXPECT_EQ(p1, p2); + // set src_tensor a new dim with smaller size + // momery block is supposed to be unchanged + p2 = src_tensor.mutable_data(framework::make_ddim({2, 2}), + platform::CPUPlace()); + EXPECT_EQ(p1, p2); + } + +#ifdef PADDLE_WITH_CUDA + { + framework::Tensor src_tensor; + float* p1 = nullptr; + float* p2 = nullptr; + // initialization + p1 = src_tensor.mutable_data(framework::make_ddim({1, 2, 3}), + platform::CUDAPlace()); + EXPECT_NE(p1, nullptr); + // set src_tensor a new dim with large size + // momery is supposed to be re-allocated + p2 = src_tensor.mutable_data(framework::make_ddim({3, 4}), + platform::CUDAPlace()); + EXPECT_NE(p2, nullptr); + EXPECT_NE(p1, p2); + // set src_tensor a new dim with same size + // momery block is supposed to be unchanged + p1 = src_tensor.mutable_data(framework::make_ddim({2, 2, 3}), + platform::CUDAPlace()); + EXPECT_EQ(p1, p2); + // set src_tensor a new dim with smaller size + // momery block is supposed to be unchanged + p2 = src_tensor.mutable_data(framework::make_ddim({2, 2}), + platform::CUDAPlace()); + EXPECT_EQ(p1, p2); + } +#endif +} + +TEST(Tensor, ShareDataWith) { + { + framework::Tensor src_tensor; + framework::Tensor 
dst_tensor; + // Try to share data form uninitialized tensor + bool caught = false; + try { + dst_tensor.ShareDataWith(src_tensor); + } catch (paddle::platform::EnforceNotMet err) { + caught = true; + std::string msg = + "holder_ should not be null\nTensor holds no memory. Call " + "Tensor::mutable_data first."; + const char* what = err.what(); + for (size_t i = 0; i < msg.length(); ++i) { + ASSERT_EQ(what[i], msg[i]); + } + } + ASSERT_TRUE(caught); + + src_tensor.mutable_data(framework::make_ddim({2, 3, 4}), + platform::CPUPlace()); + dst_tensor.ShareDataWith(src_tensor); + ASSERT_EQ(src_tensor.data(), dst_tensor.data()); + } + +#ifdef PADDLE_WITH_CUDA + { + framework::Tensor src_tensor; + framework::Tensor dst_tensor; + src_tensor.mutable_data(framework::make_ddim({2, 3, 4}), + platform::CUDAPlace()); + dst_tensor.ShareDataWith(src_tensor); + ASSERT_EQ(src_tensor.data(), dst_tensor.data()); + } +#endif +} + +TEST(Tensor, Slice) { + { + framework::Tensor src_tensor; + src_tensor.mutable_data(framework::make_ddim({5, 3, 4}), + platform::CPUPlace()); + framework::Tensor slice_tensor = src_tensor.Slice(1, 3); + framework::DDim slice_dims = slice_tensor.dims(); + ASSERT_EQ(arity(slice_dims), 3); + EXPECT_EQ(slice_dims[0], 2); + EXPECT_EQ(slice_dims[1], 3); + EXPECT_EQ(slice_dims[2], 4); + + uintptr_t src_data_address = + reinterpret_cast(src_tensor.data()); + uintptr_t src_mutable_data_address = reinterpret_cast( + src_tensor.mutable_data(src_tensor.dims(), platform::CPUPlace())); + uintptr_t slice_data_address = + reinterpret_cast(slice_tensor.data()); + uintptr_t slice_mutable_data_address = + reinterpret_cast(slice_tensor.mutable_data( + slice_tensor.dims(), platform::CPUPlace())); + EXPECT_EQ(src_data_address, src_mutable_data_address); + EXPECT_EQ(slice_data_address, slice_mutable_data_address); + EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address); + } + +#ifdef PADDLE_WITH_CUDA + { + framework::Tensor src_tensor; + src_tensor.mutable_data(framework::make_ddim({6, 9}), + platform::CUDAPlace()); + framework::Tensor slice_tensor = src_tensor.Slice(2, 6); + framework::DDim slice_dims = slice_tensor.dims(); + ASSERT_EQ(arity(slice_dims), 2); + EXPECT_EQ(slice_dims[0], 4); + EXPECT_EQ(slice_dims[1], 9); + + uintptr_t src_data_address = + reinterpret_cast(src_tensor.data()); + uintptr_t src_mutable_data_address = + reinterpret_cast(src_tensor.mutable_data( + src_tensor.dims(), platform::CUDAPlace())); + uintptr_t slice_data_address = + reinterpret_cast(slice_tensor.data()); + uintptr_t slice_mutable_data_address = + reinterpret_cast(slice_tensor.mutable_data( + slice_tensor.dims(), platform::CUDAPlace())); + EXPECT_EQ(src_data_address, src_mutable_data_address); + EXPECT_EQ(slice_data_address, slice_mutable_data_address); + EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address); + } +#endif +} + +TEST(Tensor, ReshapeToMatrix) { + framework::Tensor src; + int* src_ptr = src.mutable_data({2, 3, 4, 9}, platform::CPUPlace()); + for (int i = 0; i < 2 * 3 * 4 * 9; ++i) { + src_ptr[i] = i; + } + framework::Tensor res = framework::ReshapeToMatrix(src, 2); + ASSERT_EQ(res.dims()[0], 2 * 3); + ASSERT_EQ(res.dims()[1], 4 * 9); +} + +TEST(Tensor, Layout) { + framework::Tensor src; + ASSERT_EQ(src.layout(), framework::DataLayout::kNHWC); + src.set_layout(framework::DataLayout::kAnyLayout); + ASSERT_EQ(src.layout(), framework::DataLayout::kAnyLayout); +} diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc new file mode 100644 
index 0000000000000000000000000000000000000000..537fb4614cac8bc898b277899f803a3b1846a00e --- /dev/null +++ b/paddle/fluid/framework/tensor_util.cc @@ -0,0 +1,119 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/fluid/framework/tensor_util.h" + +namespace paddle { +namespace framework { +template +struct AnyDTypeVisitor { + Predicate predicate_; + const Tensor& tensor_; + const DevCtx& ctx_; + Tensor* out_; + + AnyDTypeVisitor(Predicate predicate, const Tensor& tensor, const DevCtx& ctx, + Tensor* out) + : predicate_(predicate), tensor_(tensor), ctx_(ctx), out_(out) {} + + template + void operator()() const { + auto t = EigenVector::Flatten(tensor_); + auto o = EigenScalar::From(*out_); + // return any of predicate_(t) is true. + o.device(*ctx_.eigen_device()) = predicate_(t).any(); + } +}; + +template +inline void AnyImpl(Predicate predicate, const framework::Tensor& tensor, + const DevCtx& ctx, framework::Tensor* out) { + VisitDataType(ToDataType(tensor.type()), AnyDTypeVisitor( + predicate, tensor, ctx, out)); +} + +template +struct AnyVisitor : public boost::static_visitor { + const framework::Tensor& tensor_; + Predicate predicate_; + + AnyVisitor(const framework::Tensor& tensor, Predicate predicate) + : tensor_(tensor), predicate_(std::move(predicate)) {} + + template + bool operator()(const Place& place) const { + framework::Tensor out; + out.Resize({1}); + out.mutable_data(place); + auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(place); + AnyImpl(predicate_, tensor_, *ctx, &out); + return this->GetResult(out, place); + } + + bool GetResult(const framework::Tensor& out, + const platform::CUDAPlace& gpu) const { + platform::CPUPlace cpu; + framework::Tensor tmp; + tmp.Resize({1}); + tmp.mutable_data(cpu); + auto gpuctx = platform::DeviceContextPool::Instance().Get(gpu); + gpuctx->Wait(); + Copy(out, cpu, *gpuctx, &tmp); + gpuctx->Wait(); + return GetResult(tmp, cpu); + } + + bool GetResult(const framework::Tensor& out, + const platform::CPUPlace& cpu) const { + return *out.data(); + } +}; + +template +inline bool Any(const framework::Tensor& tensor, Predicate predicate) { + AnyVisitor visitor(tensor, predicate); + auto place = tensor.place(); + return platform::VisitPlace(place, visitor); +} + +struct HasNANPredicate { + template + auto operator()(const T& eigen_vec) const + -> decltype(std::declval().isnan()) { + // Cast eigen_vector to vector of bool. true if is inf. + return eigen_vec.isnan(); + } +}; + +bool HasNAN(const framework::Tensor& tensor) { + HasNANPredicate predicate; + return Any(tensor, predicate); +} + +struct HasInfPredicate { + template + auto operator()(const T& eigen_vec) const + -> decltype(std::declval().isinf()) { + // Cast eigen_vector to vector of bool. true if is inf. 
+ return eigen_vec.isinf(); + } +}; + +bool HasInf(const framework::Tensor& tensor) { + HasInfPredicate predicate; + return Any(tensor, predicate); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/tensor_util.cu b/paddle/fluid/framework/tensor_util.cu new file mode 100644 index 0000000000000000000000000000000000000000..537fb4614cac8bc898b277899f803a3b1846a00e --- /dev/null +++ b/paddle/fluid/framework/tensor_util.cu @@ -0,0 +1,119 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/fluid/framework/tensor_util.h" + +namespace paddle { +namespace framework { +template +struct AnyDTypeVisitor { + Predicate predicate_; + const Tensor& tensor_; + const DevCtx& ctx_; + Tensor* out_; + + AnyDTypeVisitor(Predicate predicate, const Tensor& tensor, const DevCtx& ctx, + Tensor* out) + : predicate_(predicate), tensor_(tensor), ctx_(ctx), out_(out) {} + + template + void operator()() const { + auto t = EigenVector::Flatten(tensor_); + auto o = EigenScalar::From(*out_); + // return any of predicate_(t) is true. + o.device(*ctx_.eigen_device()) = predicate_(t).any(); + } +}; + +template +inline void AnyImpl(Predicate predicate, const framework::Tensor& tensor, + const DevCtx& ctx, framework::Tensor* out) { + VisitDataType(ToDataType(tensor.type()), AnyDTypeVisitor( + predicate, tensor, ctx, out)); +} + +template +struct AnyVisitor : public boost::static_visitor { + const framework::Tensor& tensor_; + Predicate predicate_; + + AnyVisitor(const framework::Tensor& tensor, Predicate predicate) + : tensor_(tensor), predicate_(std::move(predicate)) {} + + template + bool operator()(const Place& place) const { + framework::Tensor out; + out.Resize({1}); + out.mutable_data(place); + auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(place); + AnyImpl(predicate_, tensor_, *ctx, &out); + return this->GetResult(out, place); + } + + bool GetResult(const framework::Tensor& out, + const platform::CUDAPlace& gpu) const { + platform::CPUPlace cpu; + framework::Tensor tmp; + tmp.Resize({1}); + tmp.mutable_data(cpu); + auto gpuctx = platform::DeviceContextPool::Instance().Get(gpu); + gpuctx->Wait(); + Copy(out, cpu, *gpuctx, &tmp); + gpuctx->Wait(); + return GetResult(tmp, cpu); + } + + bool GetResult(const framework::Tensor& out, + const platform::CPUPlace& cpu) const { + return *out.data(); + } +}; + +template +inline bool Any(const framework::Tensor& tensor, Predicate predicate) { + AnyVisitor visitor(tensor, predicate); + auto place = tensor.place(); + return platform::VisitPlace(place, visitor); +} + +struct HasNANPredicate { + template + auto operator()(const T& eigen_vec) const + -> decltype(std::declval().isnan()) { + // Cast eigen_vector to vector of bool. true if is inf. 
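A CPU-side usage sketch (illustration only, not part of this patch) of the HasNAN/HasInf helpers implemented above; their declarations live in tensor_util.h below.

#include <cmath>
#include "paddle/fluid/framework/tensor_util.h"

void NanInfCheckSketch() {
  namespace fw = paddle::framework;
  fw::Tensor t;
  float* buf =
      t.mutable_data<float>(fw::make_ddim({3}), paddle::platform::CPUPlace());
  buf[0] = 1.0f;
  buf[1] = NAN;       // quiet NaN from <cmath>
  buf[2] = INFINITY;  // positive infinity from <cmath>
  bool has_nan = fw::HasNAN(t);  // true
  bool has_inf = fw::HasInf(t);  // true
  (void)has_nan;
  (void)has_inf;
}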
+ return eigen_vec.isnan(); + } +}; + +bool HasNAN(const framework::Tensor& tensor) { + HasNANPredicate predicate; + return Any(tensor, predicate); +} + +struct HasInfPredicate { + template + auto operator()(const T& eigen_vec) const + -> decltype(std::declval().isinf()) { + // Cast eigen_vector to vector of bool. true if is inf. + return eigen_vec.isinf(); + } +}; + +bool HasInf(const framework::Tensor& tensor) { + HasInfPredicate predicate; + return Any(tensor, predicate); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h new file mode 100644 index 0000000000000000000000000000000000000000..b7e772b6daad93dc915665d58d4a5722c74c0d2b --- /dev/null +++ b/paddle/fluid/framework/tensor_util.h @@ -0,0 +1,333 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace framework { + +/** + * @brief Copy the content of external tensor to a new place. + * + * @param[in] src The external tensor. + * @param[in] dst_place The dst place. + * @param[in] ctx The device context contains device resources. + * + * @note Copy supports CPU <-> GPU, GPU <-> GPU. 
+ */ +inline void Copy(const Tensor& src, const platform::Place& dst_place, + const platform::DeviceContext& ctx, Tensor* dst) { + VLOG(3) << "Copy " << src.dims() << " from " << src.place() << " to " + << dst_place; + src.check_memory_size(); + + dst->Resize(src.dims()); + dst->set_layout(src.layout()); + auto src_place = src.place(); + auto src_ptr = src.data(); + + auto dst_ptr = dst->mutable_data(dst_place, src.type()); + + auto size = src.numel() * SizeOfType(src.type()); + + if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { + memory::Copy(boost::get(dst_place), dst_ptr, + boost::get(src_place), src_ptr, size); + } +#ifdef PADDLE_WITH_CUDA + else if (platform::is_gpu_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { + auto src_gpu_place = boost::get(src_place); + auto dst_cpu_place = boost::get(dst_place); + auto ctx_place = ctx.GetPlace(); + PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); + auto ctx_gpu_place = boost::get(ctx_place); + PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place); + memory::Copy( + dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, + reinterpret_cast(ctx).stream()); + } else if (platform::is_cpu_place(src_place) && + platform::is_gpu_place(dst_place)) { + auto src_cpu_place = boost::get(src_place); + auto dst_gpu_place = boost::get(dst_place); + auto ctx_place = ctx.GetPlace(); + PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); + auto ctx_gpu_place = boost::get(ctx_place); + PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place); + memory::Copy( + dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, + reinterpret_cast(ctx).stream()); + } else if (platform::is_gpu_place(src_place) && + platform::is_gpu_place(dst_place)) { + auto src_gpu_place = boost::get(src_place); + auto dst_gpu_place = boost::get(dst_place); + auto ctx_place = ctx.GetPlace(); + PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); + auto ctx_gpu_place = boost::get(ctx_place); + PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place); + memory::Copy( + dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, + reinterpret_cast(ctx).stream()); + } +#endif +} + +/** + * @brief Wrapper on + * Copy(const Tensor& src, const platform::Place& dst_place, + * const platform::DeviceContext& ctx, Tensor* dst); + * + * @param[in] src The external tensor. + * @param[in] dst_place The dst place. + * + * @note Copy supports CPU <-> GPU, GPU <-> GPU. + */ +inline void Copy(const Tensor& src, const platform::Place& dst_place, + Tensor* dst) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + const platform::DeviceContext* dev_ctx; + if (platform::is_gpu_place(src.place())) { + dev_ctx = pool.Get(src.place()); + } else { + dev_ctx = pool.Get(dst_place); + } + Copy(src, dst_place, *dev_ctx, dst); +} + +/** + * @brief Copy the content of an external vector to a tensor. + * + * @param[in] src The external tensor. + * @param[in] ctx The device context contains device resources. + * + * * @note CopyFromVector will resize dst to an 1D tensor with the same + * size as src. 
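A sketch (illustration only, not part of this patch) of the two Copy overloads above, assuming src lives on the CPU: the explicit-context form is used for device transfers and the convenience form fetches a context from the global pool.

#include "paddle/fluid/framework/tensor_util.h"

void CopySketch(const paddle::framework::Tensor& src) {
  namespace fw = paddle::framework;
  namespace pf = paddle::platform;

  // CPU -> CPU with the convenience overload; a context is picked internally.
  fw::Tensor cpu_dst;
  fw::Copy(src, pf::CPUPlace(), &cpu_dst);

#ifdef PADDLE_WITH_CUDA
  // CPU -> GPU with an explicit CUDA context; the copy runs on the context's
  // stream, so Wait() before reading the result elsewhere.
  pf::CUDAPlace gpu(0);
  pf::CUDADeviceContext gpu_ctx(gpu);
  fw::Tensor gpu_dst;
  fw::Copy(src, gpu, gpu_ctx, &gpu_dst);
  gpu_ctx.Wait();
#endif
}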
+ */ +template +inline void CopyFromVector(const std::vector& src, + const platform::DeviceContext& ctx, Tensor* dst) { + auto dst_place = ctx.GetPlace(); + auto src_ptr = static_cast(src.data()); + platform::CPUPlace src_place; + dst->Resize({static_cast(src.size())}); + auto dst_ptr = static_cast(dst->mutable_data(dst_place)); + auto size = src.size() * sizeof(T); + + if (platform::is_cpu_place(dst_place)) { + memory::Copy(boost::get(dst_place), dst_ptr, src_place, + src_ptr, size); + } +#ifdef PADDLE_WITH_CUDA + else if (platform::is_gpu_place(dst_place)) { // NOLINT + memory::Copy( + boost::get(dst_place), dst_ptr, src_place, src_ptr, + size, + reinterpret_cast(ctx).stream()); + } +#endif +} + +/** + * @brief CopyFromVector CPU vector -> CPU Tensor + */ +template +inline void CopyFromVector(const std::vector& src, Tensor* dst) { + platform::CPUPlace dst_place = platform::CPUPlace(); + auto src_ptr = static_cast(src.data()); + platform::CPUPlace src_place; + dst->Resize({static_cast(src.size())}); + auto dst_ptr = static_cast(dst->mutable_data(dst_place)); + auto size = src.size() * sizeof(T); + + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); +} + +/** + * @brief Copy the content of a tensor to a vector + * + * @param[in] src The external tensor. + * @param[in] ctx The device context contains device resources. + * + * * @note CopyFromVector assumes that the tensor has been resized + * before invoking. + */ +template +inline void CopyToVector(const Tensor& src, const platform::DeviceContext& ctx, + std::vector* dst) { + auto src_ptr = static_cast(src.data()); + auto size = src.numel() * sizeof(T); + + platform::CPUPlace dst_place; + dst->resize(src.numel()); + auto dst_ptr = static_cast(dst->data()); + + if (platform::is_cpu_place(src.place())) { + memory::Copy(dst_place, dst_ptr, + boost::get(src.place()), src_ptr, size); + } +#ifdef PADDLE_WITH_CUDA + else if (platform::is_gpu_place(src.place())) { // NOLINT + memory::Copy( + dst_place, dst_ptr, boost::get(src.place()), + src_ptr, size, + reinterpret_cast(ctx).stream()); + } +#endif +} + +/** + * @brief CopyToVector CPUTensor <-> CPU Vector + */ +template +inline void CopyToVector(const Tensor& src, std::vector* dst) { + auto src_ptr = static_cast(src.data()); + auto size = src.numel() * sizeof(T); + + platform::CPUPlace dst_place; + dst->resize(src.numel()); + auto dst_ptr = static_cast(dst->data()); + + PADDLE_ENFORCE(platform::is_cpu_place(src.place())); + + memory::Copy(dst_place, dst_ptr, boost::get(src.place()), + src_ptr, size); +} + +// Returns true if a tensor contains NAN, i.e., Not A Number. +bool HasNAN(const framework::Tensor& tensor); + +// Returns true if a tensor contains Inf, i.e., Infinity. 
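A round-trip sketch (illustration only, not part of this patch) for the CPU-only CopyFromVector/CopyToVector overloads above:

#include <vector>
#include "paddle/fluid/framework/tensor_util.h"

void VectorRoundTripSketch() {
  namespace fw = paddle::framework;
  std::vector<int> src = {1, 2, 3, 4, 5, 6};

  // CopyFromVector resizes dst to a 1-D tensor with src.size() elements.
  fw::Tensor t;
  fw::CopyFromVector<int>(src, &t);  // t.dims() == {6}

  // Reshape afterwards if a different layout is wanted; numel stays the same.
  t.Resize(fw::make_ddim({2, 3}));

  std::vector<int> back;
  fw::CopyToVector<int>(t, &back);   // back == {1, 2, 3, 4, 5, 6}
}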
+bool HasInf(const framework::Tensor& tensor); + +inline void SerializeToStream(std::ostream& os, const Tensor& tensor, + const platform::DeviceContext& dev_ctx) { + // TODO(typhoonzero): serialize to ostream + { // the 1st field, uint32_t version + constexpr uint32_t version = 0; + os.write(reinterpret_cast(&version), sizeof(version)); + } + { // the 2nd field, tensor description + // int32_t size + // void* protobuf message + proto::TensorDesc desc; + desc.set_data_type(framework::ToDataType(tensor.type())); + auto dims = framework::vectorize(tensor.dims()); + auto* pb_dims = desc.mutable_dims(); + pb_dims->Resize(static_cast(dims.size()), 0); + std::copy(dims.begin(), dims.end(), pb_dims->begin()); + int32_t size = desc.ByteSize(); + os.write(reinterpret_cast(&size), sizeof(size)); + auto out = desc.SerializeAsString(); + os.write(out.data(), size); + } + { // the 3rd field, tensor data + uint64_t size = tensor.memory_size(); + auto* data_ptr = tensor.data(); + PADDLE_ENFORCE(size < std::numeric_limits::max(), + "Index overflow when writing tensor"); + if (platform::is_gpu_place(tensor.place())) { +#ifdef PADDLE_WITH_CUDA + constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB + std::unique_ptr buf(new char[kBufSize]); + auto& gpu_dev_ctx = + static_cast(dev_ctx); + platform::CPUPlace cpu; + uintptr_t data = reinterpret_cast(data_ptr); + while (size != 0) { + size_t size_to_write = std::min(kBufSize, static_cast(size)); + memory::Copy(cpu, buf.get(), + boost::get(tensor.place()), + reinterpret_cast(data), size_to_write, + gpu_dev_ctx.stream()); + gpu_dev_ctx.Wait(); + os.write(buf.get(), size_to_write); + data += size_to_write; + size -= size_to_write; + } +#else + PADDLE_THROW("Unexpected branch"); +#endif + } else { + os.write(static_cast(data_ptr), + static_cast(size)); + } + } +} + +struct DeserializedDataFunctor { + DeserializedDataFunctor(void** buf, Tensor* tensor, + const platform::Place& place) + : buf_(buf), tensor_(tensor), place_(place) {} + + template + void operator()() { + *buf_ = tensor_->mutable_data(place_); + } + + void** buf_; + Tensor* tensor_; + platform::Place place_; +}; + +inline void DeserializeFromStream(std::istream& is, Tensor* tensor, + const platform::DeviceContext& dev_ctx) { + uint32_t version; + is.read(reinterpret_cast(&version), sizeof(version)); + PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported"); + proto::TensorDesc desc; + { // int32_t size + // proto buffer + int32_t size; + is.read(reinterpret_cast(&size), sizeof(size)); + std::unique_ptr buf(new char[size]); + is.read(reinterpret_cast(buf.get()), size); + PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size), + "Cannot parse tensor desc"); + } + { // read tensor + std::vector dims; + dims.reserve(static_cast(desc.dims().size())); + std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims)); + tensor->Resize(framework::make_ddim(dims)); + void* buf; + auto ctx = platform::CPUDeviceContext(); + if (platform::is_gpu_place(dev_ctx.GetPlace())) { +#ifdef PADDLE_WITH_CUDA + Tensor cpu_tensor; + cpu_tensor.Resize(framework::make_ddim(dims)); + framework::VisitDataType( + desc.data_type(), + DeserializedDataFunctor(&buf, &cpu_tensor, ctx.GetPlace())); + is.read(static_cast(buf), cpu_tensor.memory_size()); + auto dst_place = dev_ctx.GetPlace(); + framework::Copy(cpu_tensor, dst_place, dev_ctx, tensor); +#else + PADDLE_THROW("Unexpected branch"); +#endif + } else { + framework::VisitDataType( + desc.data_type(), + DeserializedDataFunctor(&buf, tensor, ctx.GetPlace())); + 
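The SerializeToStream/DeserializeFromStream helpers defined here work against any std::ostream/std::istream; a file-based round trip, sketched for illustration only (not part of this patch) and assuming the tensor lives on the CPU:

#include <fstream>
#include <string>
#include "paddle/fluid/framework/tensor_util.h"

void SaveAndLoadSketch(const paddle::framework::Tensor& t,
                       const std::string& path) {
  namespace fw = paddle::framework;
  paddle::platform::CPUDeviceContext cpu_ctx;
  {
    std::ofstream out(path, std::ios::binary);
    fw::SerializeToStream(out, t, cpu_ctx);
  }
  {
    std::ifstream in(path, std::ios::binary);
    fw::Tensor loaded;
    fw::DeserializeFromStream(in, &loaded, cpu_ctx);
  }
}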
is.read(static_cast(buf), tensor->memory_size()); + } + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..8764c692e875328fc98a7b67a018014af487f394 --- /dev/null +++ b/paddle/fluid/framework/tensor_util_test.cc @@ -0,0 +1,309 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/tensor_util.h" +#include +#include +#include + +namespace paddle { +namespace framework { + +TEST(Copy, Tensor) { + Tensor src_tensor; + Tensor dst_tensor; + platform::CPUDeviceContext cpu_ctx((platform::CPUPlace())); + + int* src_ptr = + src_tensor.mutable_data(make_ddim({3, 3}), platform::CPUPlace()); + + int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + memcpy(src_ptr, arr, 9 * sizeof(int)); + src_tensor.set_layout(DataLayout::kAnyLayout); + + auto cpu_place = new platform::CPUPlace(); + Copy(src_tensor, *cpu_place, &dst_tensor); + + const int* dst_ptr = dst_tensor.data(); + ASSERT_NE(src_ptr, dst_ptr); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], dst_ptr[i]); + } + + EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout()); + + Tensor slice_tensor = src_tensor.Slice(1, 2); + Copy(slice_tensor, *cpu_place, &dst_tensor); + const int* slice_ptr = slice_tensor.data(); + dst_ptr = dst_tensor.data(); + ASSERT_NE(dst_ptr, slice_ptr); + for (size_t i = 0; i < 3; ++i) { + EXPECT_EQ(dst_ptr[i], slice_ptr[i]); + } + EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout()); + +#ifdef PADDLE_WITH_CUDA + { + Tensor src_tensor; + Tensor gpu_tensor; + Tensor dst_tensor; + + int* src_ptr = + src_tensor.mutable_data(make_ddim({3, 3}), platform::CPUPlace()); + + int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + memcpy(src_ptr, arr, 9 * sizeof(int)); + + // CPU Tensor to GPU Tensor + auto gpu_place = new platform::CUDAPlace(0); + platform::CUDADeviceContext gpu_ctx(*gpu_place); + Copy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor); + + // GPU Tensor to CPU Tensor + auto cpu_place = new platform::CPUPlace(); + Copy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); + + // Sync before Compare Tensors + gpu_ctx.Wait(); + const int* dst_ptr = dst_tensor.data(); + ASSERT_NE(src_ptr, dst_ptr); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], dst_ptr[i]); + } + + Tensor slice_tensor = src_tensor.Slice(1, 2); + + // CPU Slice Tensor to GPU Tensor + Copy(slice_tensor, *gpu_place, gpu_ctx, &gpu_tensor); + + // GPU Tensor to CPU Tensor + Copy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); + + // Sync before Compare Slice Tensors + gpu_ctx.Wait(); + const int* slice_ptr = slice_tensor.data(); + dst_ptr = dst_tensor.data(); + ASSERT_NE(dst_ptr, slice_ptr); + for (size_t i = 0; i < 3; ++i) { + EXPECT_EQ(dst_ptr[i], slice_ptr[i]); + } + + EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout()); + } +#endif +} + +TEST(CopyFromVector, Tensor) { + using namespace 
paddle::framework; + using namespace paddle::platform; + { + std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + Tensor cpu_tensor; + + // Copy to CPU Tensor + cpu_tensor.Resize(make_ddim({3, 3})); + auto cpu_place = new paddle::platform::CPUPlace(); + CopyFromVector(src_vec, &cpu_tensor); + + // Compare Tensors + const int* cpu_ptr = cpu_tensor.data(); + const int* src_ptr = src_vec.data(); + ASSERT_NE(src_ptr, cpu_ptr); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], cpu_ptr[i]); + } + + src_vec.erase(src_vec.begin(), src_vec.begin() + 5); + cpu_tensor.Resize(make_ddim({2, 2})); + CopyFromVector(src_vec, &cpu_tensor); + cpu_ptr = cpu_tensor.data(); + src_ptr = src_vec.data(); + ASSERT_NE(src_ptr, cpu_ptr); + for (size_t i = 0; i < 5; ++i) { + EXPECT_EQ(src_ptr[i], cpu_ptr[i]); + } + + delete cpu_place; + } + +#ifdef PADDLE_WITH_CUDA + { + std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + Tensor cpu_tensor; + Tensor gpu_tensor; + Tensor dst_tensor; + + // Copy to CPU Tensor + cpu_tensor.Resize(make_ddim({3, 3})); + auto cpu_place = new paddle::platform::CPUPlace(); + CPUDeviceContext cpu_ctx(*cpu_place); + CopyFromVector(src_vec, cpu_ctx, &cpu_tensor); + + // Copy to GPUTensor + gpu_tensor.Resize(make_ddim({3, 3})); + auto gpu_place = new paddle::platform::CUDAPlace(); + CUDADeviceContext gpu_ctx(*gpu_place); + CopyFromVector(src_vec, gpu_ctx, &gpu_tensor); + // Copy from GPU to CPU tensor for comparison + Copy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); + + // Sync before Compare Tensors + gpu_ctx.Wait(); + const int* src_ptr = src_vec.data(); + const int* cpu_ptr = cpu_tensor.data(); + const int* dst_ptr = dst_tensor.data(); + ASSERT_NE(src_ptr, cpu_ptr); + ASSERT_NE(src_ptr, dst_ptr); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], cpu_ptr[i]); + EXPECT_EQ(src_ptr[i], dst_ptr[i]); + } + + src_vec.erase(src_vec.begin(), src_vec.begin() + 5); + + cpu_tensor.Resize(make_ddim({2, 2})); + CopyFromVector(src_vec, cpu_ctx, &cpu_tensor); + gpu_tensor.Resize(make_ddim({2, 2})); + CopyFromVector(src_vec, gpu_ctx, &gpu_tensor); + Copy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); + + // Sync before Compare Tensors + gpu_ctx.Wait(); + src_ptr = src_vec.data(); + cpu_ptr = cpu_tensor.data(); + dst_ptr = dst_tensor.data(); + ASSERT_NE(src_ptr, cpu_ptr); + ASSERT_NE(src_ptr, dst_ptr); + for (size_t i = 0; i < 5; ++i) { + EXPECT_EQ(src_ptr[i], cpu_ptr[i]); + EXPECT_EQ(src_ptr[i], dst_ptr[i]); + } + + delete cpu_place; + delete gpu_place; + } +#endif +} + +TEST(CopyToVector, Tensor) { + using namespace paddle::framework; + using namespace paddle::platform; + { + Tensor src; + int* src_ptr = src.mutable_data({3, 3}, CPUPlace()); + for (int i = 0; i < 3 * 3; ++i) { + src_ptr[i] = i; + } + + CPUPlace place; + std::vector dst; + CopyToVector(src, &dst); + + for (int i = 0; i < 3 * 3; ++i) { + EXPECT_EQ(src_ptr[i], dst[i]); + } + } +#ifdef PADDLE_WITH_CUDA + { + std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + Tensor gpu_tensor; + CUDAPlace place; + CUDADeviceContext gpu_ctx(place); + CopyFromVector(src_vec, gpu_ctx, &gpu_tensor); + + std::vector dst; + CopyToVector(gpu_tensor, gpu_ctx, &dst); + + for (int i = 0; i < 3 * 3; ++i) { + EXPECT_EQ(src_vec[i], dst[i]); + } + } +#endif +} + +TEST(HasNAN, CPU) { + using namespace paddle::framework; + using namespace paddle::platform; + Tensor src; + float* buf = src.mutable_data({3}, CPUPlace()); + buf[0] = 0.0; + buf[1] = NAN; + buf[2] = 0.0; + + ASSERT_TRUE(HasNAN(src)); +} + +TEST(HasInf, CPU) { + using namespace 
paddle::framework; + using namespace paddle::platform; + Tensor src; + double* buf = src.mutable_data({3}, CPUPlace()); + buf[0] = 1.0; + buf[1] = INFINITY; + buf[2] = 0.0; + ASSERT_TRUE(HasInf(src)); +} + +TEST(Tensor, SerializeAndDeserialize) { + framework::Tensor src_tensor; + int array[6] = {1, 2, 3, 4, 5, 6}; + src_tensor.Resize({2, 3}); + int* src_ptr = src_tensor.mutable_data(platform::CPUPlace()); + for (int i = 0; i < 6; ++i) { + src_ptr[i] = array[i]; + } + { + framework::Tensor dst_tensor; + auto place = new platform::CPUPlace(); + platform::CPUDeviceContext cpu_ctx(*place); + std::ostringstream oss; + SerializeToStream(oss, src_tensor, cpu_ctx); + + std::istringstream iss(oss.str()); + DeserializeFromStream(iss, &dst_tensor, cpu_ctx); + int* dst_ptr = dst_tensor.mutable_data(platform::CPUPlace()); + for (int i = 0; i < 5; ++i) { + ASSERT_EQ(dst_ptr[i], array[i]); + } + ASSERT_EQ(dst_tensor.dims(), src_tensor.dims()); + delete place; + } +#ifdef PADDLE_WITH_CUDA + { + Tensor gpu_tensor; + gpu_tensor.Resize({2, 3}); + Tensor dst_tensor; + + auto gpu_place = new platform::CUDAPlace(); + platform::CUDADeviceContext gpu_ctx(*gpu_place); + + Copy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor); + + std::ostringstream oss; + SerializeToStream(oss, gpu_tensor, gpu_ctx); + + std::istringstream iss(oss.str()); + DeserializeFromStream(iss, &dst_tensor, gpu_ctx); + + int* dst_ptr = dst_tensor.mutable_data(platform::CPUPlace()); + for (int i = 0; i < 6; ++i) { + ASSERT_EQ(dst_ptr[i], array[i]); + } + delete gpu_place; + } +#endif +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/tensor_util_test.cu b/paddle/fluid/framework/tensor_util_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..1982b642bcd1f2e21a684b701d7bd603f0c2c894 --- /dev/null +++ b/paddle/fluid/framework/tensor_util_test.cu @@ -0,0 +1,57 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace framework { + +static __global__ void FillNAN(float* buf) { + buf[0] = 0.0; + buf[1] = 0.1; + buf[2] = NAN; +} +static __global__ void FillInf(float* buf) { + buf[0] = 0.0; + buf[1] = INFINITY; + buf[2] = 0.5; +} + +TEST(HasNAN, GPU) { + Tensor tensor; + platform::CUDAPlace gpu(0); + auto& pool = platform::DeviceContextPool::Instance(); + auto* cuda_ctx = pool.GetByPlace(gpu); + float* buf = tensor.mutable_data({3}, gpu); + FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf); + cuda_ctx->Wait(); + ASSERT_TRUE(HasNAN(tensor)); +} + +TEST(HasInf, GPU) { + Tensor tensor; + platform::CUDAPlace gpu(0); + auto& pool = platform::DeviceContextPool::Instance(); + auto* cuda_ctx = pool.GetByPlace(gpu); + float* buf = tensor.mutable_data({3}, gpu); + FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf); + cuda_ctx->Wait(); + ASSERT_TRUE(HasInf(tensor)); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/threadpool.cc b/paddle/fluid/framework/threadpool.cc new file mode 100644 index 0000000000000000000000000000000000000000..2c4de41b0c41fa3eeaf6c77def9c728dd9976895 --- /dev/null +++ b/paddle/fluid/framework/threadpool.cc @@ -0,0 +1,95 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include "paddle/fluid/framework/threadpool.h" + +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { + +std::unique_ptr ThreadPool::threadpool_(nullptr); +std::once_flag ThreadPool::init_flag_; + +ThreadPool* ThreadPool::GetInstance() { + std::call_once(init_flag_, &ThreadPool::Init); + return threadpool_.get(); +} + +void ThreadPool::Init() { + if (threadpool_.get() == nullptr) { + // TODO(Yancey1989): specify the max threads number + int num_threads = std::thread::hardware_concurrency(); + PADDLE_ENFORCE_GT(num_threads, 0); + threadpool_.reset(new ThreadPool(num_threads)); + } +} + +ThreadPool::ThreadPool(int num_threads) + : total_threads_(num_threads), idle_threads_(num_threads), running_(true) { + threads_.resize(num_threads); + for (auto& thread : threads_) { + // TODO(Yancey1989): binding the thread on the specify CPU number + thread.reset(new std::thread(std::bind(&ThreadPool::TaskLoop, this))); + } +} + +ThreadPool::~ThreadPool() { + { + // notify all threads to stop running + running_ = false; + scheduled_.notify_all(); + } + + for (auto& t : threads_) { + t->join(); + t.reset(nullptr); + } +} + +void ThreadPool::Wait() { + std::unique_lock lock(mutex_); + completed_.wait(lock, [=] { return Done() == true; }); +} + +void ThreadPool::TaskLoop() { + while (running_) { + std::unique_lock lock(mutex_); + scheduled_.wait(lock, [=] { return !tasks_.empty() || !running_; }); + + if (!running_) { + break; + } + // pop a task from the task queue + auto task = std::move(tasks_.front()); + tasks_.pop(); + + --idle_threads_; + lock.unlock(); + + // run the task + task(); + + { + std::unique_lock lock(mutex_); + ++idle_threads_; + if (Done()) { + completed_.notify_all(); + } + } + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/threadpool.h b/paddle/fluid/framework/threadpool.h new file mode 100644 index 0000000000000000000000000000000000000000..e88e6c01f02deb77278a02ba81ee62ddfcf42eb8 --- /dev/null +++ b/paddle/fluid/framework/threadpool.h @@ -0,0 +1,147 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include "glog/logging.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN + +namespace paddle { +namespace framework { + +// ThreadPool maintains a queue of tasks, and runs them using a fixed +// number of threads. +class ThreadPool { + public: + using Task = std::packaged_task()>; + + // Returns the singleton of ThreadPool. + static ThreadPool* GetInstance(); + + ~ThreadPool(); + + // Returns the number of threads created by the constructor. + size_t Threads() const { return total_threads_; } + + // Returns the number of currently idle threads. 
+ size_t IdleThreads() { + std::unique_lock lock(mutex_); + return idle_threads_; + } + + // Run pushes a function to the task queue and returns a std::future + // object. To wait for the completion of the task, call + // std::future::wait(). + template + std::future Run(Callback fn) { + auto f = this->RunAndGetException(fn); + return std::async(std::launch::deferred, ExceptionHandler(std::move(f))); + } + + template + std::future> RunAndGetException( + Callback fn) { + std::unique_lock lock(mutex_); + Task task([fn]() -> std::unique_ptr { + try { + fn(); + return nullptr; + } catch (platform::EnforceNotMet ex) { + return std::unique_ptr( + new platform::EnforceNotMet(ex)); + } catch (...) { + LOG(FATAL) + << "Unexpected exception is catched in thread pool. All " + "throwable exception in Fluid should be an EnforceNotMet."; + } + }); + std::future> f = task.get_future(); + tasks_.push(std::move(task)); + lock.unlock(); + scheduled_.notify_one(); + return f; + } + + // Wait until all the tasks are completed. + void Wait(); + + private: + struct ExceptionHandler { + mutable std::future> future_; + explicit ExceptionHandler( + std::future>&& f) + : future_(std::move(f)) {} + void operator()() const { + auto ex = this->future_.get(); + if (ex != nullptr) { + LOG(FATAL) << "The exception is thrown inside the thread pool. You " + "should use RunAndGetException to handle the exception.\n" + "The default exception handler is LOG(FATAL)." + << ex->what(); + } + } + }; + + DISABLE_COPY_AND_ASSIGN(ThreadPool); + + explicit ThreadPool(int num_threads); + + // If the task queue is empty and avaialbe is equal to the number of + // threads, means that all tasks are completed. Note: this function + // is not thread-safe. Returns true if all tasks are completed. + // Note: don't delete the data member total_threads_ and use + // threads_.size() instead; because you'd need to lock the mutex + // before accessing threads_. + bool Done() { return tasks_.empty() && idle_threads_ == total_threads_; } + + // The constructor starts threads to run TaskLoop, which retrieves + // and runs tasks from the queue. + void TaskLoop(); + + // Init is called by GetInstance. + static void Init(); + + private: + static std::unique_ptr threadpool_; + static std::once_flag init_flag_; + + std::vector> threads_; + const size_t total_threads_; + size_t idle_threads_; + + std::queue tasks_; + std::mutex mutex_; + bool running_; + std::condition_variable scheduled_; + std::condition_variable completed_; +}; + +// Run a function asynchronously. +// NOTE: The function must return void. If the function need to return a value, +// you can use lambda to capture a value pointer. +template +std::future Async(Callback callback) { + return ThreadPool::GetInstance()->Run(callback); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/threadpool_test.cc b/paddle/fluid/framework/threadpool_test.cc similarity index 100% rename from paddle/framework/threadpool_test.cc rename to paddle/fluid/framework/threadpool_test.cc diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h new file mode 100644 index 0000000000000000000000000000000000000000..786d78a6440de60abea44f7b8fccb90d455b488c --- /dev/null +++ b/paddle/fluid/framework/type_defs.h @@ -0,0 +1,57 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
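A usage sketch (illustration only, not part of this patch) of the thread-pool interface above: Async/Run hand work to the singleton pool and Wait blocks until the queue drains.

#include <atomic>
#include "paddle/fluid/framework/threadpool.h"

void ThreadPoolSketch() {
  namespace fw = paddle::framework;
  std::atomic<int> counter(0);

  // Each call enqueues a task on the singleton pool and returns a
  // std::future<void> that can be waited on individually if needed.
  for (int i = 0; i < 8; ++i) {
    fw::Async([&counter] { counter.fetch_add(1); });
  }

  // Block until every queued task has finished.
  fw::ThreadPool::GetInstance()->Wait();
  // counter == 8 here.
}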
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/platform/variant.h" + +namespace paddle { +namespace framework { +class OperatorBase; +class OpDesc; +class InferShapeContext; +class BlockDesc; + +using VariableNameMap = std::map>; + +// The order should be as same as framework.proto +using Attribute = + boost::variant, + std::vector, std::vector, bool, + std::vector, BlockDesc*, int64_t>; + +using AttributeMap = std::unordered_map; + +using OpCreator = std::function; + +using GradOpMakerFN = std::function>( + const OpDesc&, const std::unordered_set& /*no_grad_set*/, + std::unordered_map* /*grad_to_var*/, + const std::vector& grad_block)>; + +using InferVarTypeFN = + std::function; + +using InferShapeFN = std::function; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/var_desc.cc b/paddle/fluid/framework/var_desc.cc new file mode 100644 index 0000000000000000000000000000000000000000..7ec9b2ced94c4176b64996827dcb79f1d756be6b --- /dev/null +++ b/paddle/fluid/framework/var_desc.cc @@ -0,0 +1,259 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/var_desc.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { + +proto::VarDesc::VarType VarDesc::GetType() const { return desc_.type(); } + +void VarDesc::SetType(proto::VarDesc::VarType type) { desc_.set_type(type); } + +void VarDesc::SetShape(const std::vector &dims) { + VectorToRepeated(dims, mutable_tensor_desc()->mutable_dims()); +} + +void VarDesc::SetTensorDescNum(size_t num) { + switch (desc_.type()) { + case proto::VarDesc::READER: { + auto *lod_tensors_ptr = desc_.mutable_reader()->mutable_lod_tensor(); + lod_tensors_ptr->Clear(); + for (size_t i = 0; i < num; ++i) { + lod_tensors_ptr->Add(); + } + return; + } break; + default: + PADDLE_THROW( + "Setting 'sub_tensor_number' is not supported by the type of var %s.", + this->Name()); + } +} + +size_t VarDesc::GetTensorDescNum() const { + switch (desc_.type()) { + case proto::VarDesc::READER: + return desc_.reader().lod_tensor_size(); + break; + default: + PADDLE_THROW( + "Getting 'sub_tensor_number' is not supported by the type of var %s.", + this->Name()); + } +} + +void VarDesc::SetShapes( + const std::vector> &multiple_dims) { + if (multiple_dims.size() != GetTensorDescNum()) { + VLOG(3) << "WARNING: The number of given shapes(" << multiple_dims.size() + << ") doesn't match the existing tensor number(" + << GetTensorDescNum() + << "). 
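A small sketch (illustration only, not part of this patch) of the Attribute/AttributeMap aliases from type_defs.h above, assuming the usual scalar alternatives (int, float, std::string) alongside the bool/int64_t/BlockDesc* ones listed there:

#include <string>
#include "paddle/fluid/framework/type_defs.h"

void AttributeMapSketch() {
  namespace fw = paddle::framework;
  fw::AttributeMap attrs;
  attrs["axis"] = 1;                          // int alternative
  attrs["scale"] = 0.5f;                      // float alternative
  attrs["activation"] = std::string("relu");  // std::string alternative
  attrs["use_cudnn"] = true;                  // bool alternative

  // Reading back goes through boost::get with the stored alternative's type.
  int axis = boost::get<int>(attrs.at("axis"));
  (void)axis;
}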
The Reader is going to be reinitialized."; + SetTensorDescNum(multiple_dims.size()); + } + std::vector tensors = mutable_tensor_descs(); + for (size_t i = 0; i < multiple_dims.size(); ++i) { + VectorToRepeated(multiple_dims[i], tensors[i]->mutable_dims()); + } +} + +std::vector VarDesc::GetShape() const { + return RepeatedToVector(tensor_desc().dims()); +} + +std::vector> VarDesc::GetShapes() const { + std::vector descs = tensor_descs(); + std::vector> res; + res.reserve(descs.size()); + for (const auto &tensor_desc : descs) { + res.push_back(RepeatedToVector(tensor_desc.dims())); + } + return res; +} + +void VarDesc::SetDataType(proto::DataType data_type) { + mutable_tensor_desc()->set_data_type(data_type); +} + +void VarDesc::SetDataTypes( + const std::vector &multiple_data_type) { + if (multiple_data_type.size() != GetTensorDescNum()) { + VLOG(3) << "WARNING: The number of given data types(" + << multiple_data_type.size() + << ") doesn't match the existing tensor number(" + << GetTensorDescNum() + << "). The Reader is going to be reinitialized."; + SetTensorDescNum(multiple_data_type.size()); + } + std::vector tensor_descs = mutable_tensor_descs(); + for (size_t i = 0; i < multiple_data_type.size(); ++i) { + tensor_descs[i]->set_data_type(multiple_data_type[i]); + } +} + +proto::DataType VarDesc::GetDataType() const { + return tensor_desc().data_type(); +} + +std::vector VarDesc::GetDataTypes() const { + std::vector descs = tensor_descs(); + std::vector res; + res.reserve(descs.size()); + for (const auto &tensor_desc : descs) { + res.push_back(tensor_desc.data_type()); + } + return res; +} + +void VarDesc::SetLoDLevel(int32_t lod_level) { + switch (desc_.type()) { + case proto::VarDesc::LOD_TENSOR: + desc_.mutable_lod_tensor()->set_lod_level(lod_level); + break; + case proto::VarDesc::LOD_TENSOR_ARRAY: + desc_.mutable_tensor_array()->set_lod_level(lod_level); + break; + default: + PADDLE_THROW( + "Setting 'lod_level' is not supported by the type of var %s.", + this->Name()); + } +} + +void VarDesc::SetLoDLevels(const std::vector &multiple_lod_level) { + if (multiple_lod_level.size() != GetTensorDescNum()) { + VLOG(3) << "WARNING: The number of given lod_levels(" + << multiple_lod_level.size() + << ") doesn't match the existing tensor number(" + << GetTensorDescNum() + << "). 
The Reader is going to be reinitialized."; + SetTensorDescNum(multiple_lod_level.size()); + } + switch (desc_.type()) { + case proto::VarDesc::READER: { + size_t i = 0; + for (auto &lod_tensor : *desc_.mutable_reader()->mutable_lod_tensor()) { + lod_tensor.set_lod_level(multiple_lod_level[i++]); + } + } break; + default: + PADDLE_THROW( + "Setting 'lod_levels' is not supported by the type of var %s.", + this->Name()); + } +} + +int32_t VarDesc::GetLoDLevel() const { + switch (desc_.type()) { + case proto::VarDesc::LOD_TENSOR: + return desc_.lod_tensor().lod_level(); + case proto::VarDesc::LOD_TENSOR_ARRAY: + return desc_.tensor_array().lod_level(); + default: + PADDLE_THROW( + "Getting 'lod_level' is not supported by the type of var %s.", + this->Name()); + } +} + +std::vector VarDesc::GetLoDLevels() const { + std::vector res; + switch (desc_.type()) { + case proto::VarDesc::READER: + res.reserve(desc_.reader().lod_tensor_size()); + for (auto &lod_tensor : desc_.reader().lod_tensor()) { + res.push_back(lod_tensor.lod_level()); + } + return res; + break; + default: + PADDLE_THROW( + "Getting 'lod_levels' is not supported by the type of var %s.", + this->Name()); + } +} + +const proto::TensorDesc &VarDesc::tensor_desc() const { + PADDLE_ENFORCE(desc_.has_type(), "The var's type hasn't been set."); + switch (desc_.type()) { + case proto::VarDesc::SELECTED_ROWS: + return desc_.selected_rows(); + case proto::VarDesc::LOD_TENSOR: + return desc_.lod_tensor().tensor(); + case proto::VarDesc::LOD_TENSOR_ARRAY: + return desc_.tensor_array().tensor(); + default: + PADDLE_THROW( + "Getting 'tensor_desc' is not supported by the type of var %s.", + this->Name()); + } +} + +std::vector VarDesc::tensor_descs() const { + PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set."); + std::vector res; + res.reserve(GetTensorDescNum()); + switch (desc_.type()) { + case proto::VarDesc::READER: + for (const auto &lod_tensor : desc_.reader().lod_tensor()) { + res.push_back(lod_tensor.tensor()); + } + return res; + default: + PADDLE_THROW( + "Getting 'tensor_descs' is not supported by the type of var " + "%s.", + this->Name()); + } +} + +proto::TensorDesc *VarDesc::mutable_tensor_desc() { + PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set."); + switch (desc_.type()) { + case proto::VarDesc::SELECTED_ROWS: + return desc_.mutable_selected_rows(); + case proto::VarDesc::LOD_TENSOR: + return desc_.mutable_lod_tensor()->mutable_tensor(); + case proto::VarDesc::LOD_TENSOR_ARRAY: + return desc_.mutable_tensor_array()->mutable_tensor(); + default: + PADDLE_THROW( + "Getting 'mutable_tensor_desc' is not supported by the type of var " + "%s.", + this->Name()); + } +} + +std::vector VarDesc::mutable_tensor_descs() { + PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set."); + std::vector res; + res.reserve(GetTensorDescNum()); + switch (desc_.type()) { + case proto::VarDesc::READER: + for (auto &lod_tensor : *desc_.mutable_reader()->mutable_lod_tensor()) { + res.push_back(lod_tensor.mutable_tensor()); + } + return res; + default: + PADDLE_THROW( + "Getting 'tensor_descs' is not supported by the type of var " + "%s.", + this->Name()); + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/var_desc.h b/paddle/fluid/framework/var_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..cdb1bc3ec09c890f2166190d591b6e6ee8b668a0 --- /dev/null +++ b/paddle/fluid/framework/var_desc.h @@ -0,0 +1,116 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. 
All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "glog/logging.h" +#include "paddle/fluid/framework/framework.pb.h" + +namespace paddle { +namespace framework { + +// convert between std::vector and protobuf repeated. +template +inline std::vector RepeatedToVector( + const google::protobuf::RepeatedField &repeated_field) { + std::vector ret; + ret.reserve(repeated_field.size()); + std::copy(repeated_field.begin(), repeated_field.end(), + std::back_inserter(ret)); + return ret; +} + +template +inline void VectorToRepeated(const std::vector &vec, + RepeatedField *repeated_field) { + repeated_field->Clear(); + repeated_field->Reserve(vec.size()); + for (const auto &elem : vec) { + *repeated_field->Add() = elem; + } +} + +// Specialize vector. +template +inline void VectorToRepeated(const std::vector &vec, + RepeatedField *repeated_field) { + repeated_field->Clear(); + repeated_field->Reserve(vec.size()); + for (auto elem : vec) { + *repeated_field->Add() = elem; + } +} + +class VarDesc { + public: + explicit VarDesc(const std::string &name) { + desc_.set_name(name); + desc_.set_type(proto::VarDesc::LOD_TENSOR); + } + + explicit VarDesc(const proto::VarDesc &desc) : desc_(desc) {} + + proto::VarDesc *Proto() { return &desc_; } + + std::string Name() const { return desc_.name(); } + + void SetName(std::string name) { desc_.set_name(name); } + + void SetTensorDescNum(size_t num); + + size_t GetTensorDescNum() const; + + void SetShape(const std::vector &dims); + + void SetShapes(const std::vector> &multiple_dims); + + std::vector GetShape() const; + + std::vector> GetShapes() const; + + void SetDataType(proto::DataType data_type); + + void SetDataTypes(const std::vector &multiple_data_type); + + proto::DataType GetDataType() const; + + std::vector GetDataTypes() const; + + void SetLoDLevel(int32_t lod_level); + + void SetLoDLevels(const std::vector &multiple_lod_level); + + int32_t GetLoDLevel() const; + + std::vector GetLoDLevels() const; + + proto::VarDesc::VarType GetType() const; + + void SetType(proto::VarDesc::VarType type); + + bool Persistable() const { return desc_.persistable(); } + + void SetPersistable(bool persistable) { desc_.set_persistable(persistable); } + + private: + const proto::TensorDesc &tensor_desc() const; + std::vector tensor_descs() const; + proto::TensorDesc *mutable_tensor_desc(); + std::vector mutable_tensor_descs(); + + proto::VarDesc desc_; +}; +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/var_type.h b/paddle/fluid/framework/var_type.h new file mode 100644 index 0000000000000000000000000000000000000000..2dc4de529814bb0f7a5193b8e216343a4b1b3503 --- /dev/null +++ b/paddle/fluid/framework/var_type.h @@ -0,0 +1,66 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
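(Aside on the VarDesc interface above: a rough usage sketch of the single-tensor and READER paths; the variable names and shapes are invented, and the FP32/INT64 data types are assumed to come from framework.proto.)

// Sketch only, not part of this diff.
void VarDescUsageExample() {
  using paddle::framework::VarDesc;
  namespace proto = paddle::framework::proto;

  VarDesc weight("fc_0.w_0");        // defaults to LOD_TENSOR
  weight.SetShape({784, 10});
  weight.SetDataType(proto::DataType::FP32);
  weight.SetPersistable(true);

  VarDesc reader("data_reader");     // multi-tensor READER variant
  reader.SetType(proto::VarDesc::READER);
  reader.SetTensorDescNum(2);        // the reader yields two tensors per batch
  reader.SetShapes({{32, 784}, {32, 1}});
  reader.SetDataTypes({proto::DataType::FP32, proto::DataType::INT64});
  reader.SetLoDLevels({0, 0});
}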
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/variable.h" + +namespace paddle { +namespace framework { +inline proto::VarDesc::VarType ToVarType(std::type_index type) { + if (type.hash_code() == typeid(LoDTensor).hash_code()) { + return proto::VarDesc_VarType_LOD_TENSOR; + } else if (type.hash_code() == typeid(LoDRankTable).hash_code()) { + return proto::VarDesc_VarType_LOD_RANK_TABLE; + } else if (type.hash_code() == typeid(LoDTensorArray).hash_code()) { + return proto::VarDesc_VarType_LOD_TENSOR_ARRAY; + } else if (type.hash_code() == typeid(SelectedRows).hash_code()) { + return proto::VarDesc_VarType_SELECTED_ROWS; + } else if (type.hash_code() == typeid(ReaderHolder).hash_code()) { + return proto::VarDesc_VarType_READER; + } else { + PADDLE_THROW("ToVarType:Unsupported type %s", type.name()); + } +} + +template +inline void VisitVarType(const framework::Variable& var, Visitor visitor) { + switch (ToVarType(var.Type())) { + case proto::VarDesc_VarType_LOD_TENSOR: + visitor(var.Get()); + return; + case proto::VarDesc_VarType_LOD_RANK_TABLE: + visitor(var.Get()); + return; + case proto::VarDesc_VarType_LOD_TENSOR_ARRAY: + visitor(var.Get()); + return; + case proto::VarDesc_VarType_SELECTED_ROWS: + visitor(var.Get()); + return; + case proto::VarDesc_VarType_READER: + visitor(var.Get()); + return; + default: + PADDLE_THROW("Not supported visit type, %d", ToVarType(var.Type())); + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/var_type_inference.h b/paddle/fluid/framework/var_type_inference.h new file mode 100644 index 0000000000000000000000000000000000000000..44fd4cd622cbada7f10ade8928a69d4e3d1d9ec0 --- /dev/null +++ b/paddle/fluid/framework/var_type_inference.h @@ -0,0 +1,28 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
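(Aside on VisitVarType above: a minimal visitor sketch; the functor name and log messages are illustrative, and the templated overload catches the remaining supported var types.)

// Sketch only, not part of this diff.
struct VarTypePrinter {
  void operator()(const paddle::framework::LoDTensor& t) const {
    VLOG(3) << "LoDTensor with dims " << t.dims();
  }
  void operator()(const paddle::framework::SelectedRows&) const {
    VLOG(3) << "SelectedRows";
  }
  template <typename T>
  void operator()(const T&) const {
    VLOG(3) << "some other supported var type";
  }
};
// Typical call site: paddle::framework::VisitVarType(var, VarTypePrinter());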
*/ + +#pragma once +#include "paddle/fluid/framework/type_defs.h" + +namespace paddle { +namespace framework { + +class VarTypeInference { + public: + virtual ~VarTypeInference() {} + virtual void operator()(const OpDesc& op_desc, BlockDesc* block) const = 0; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/var_type_inference_test.cc b/paddle/fluid/framework/var_type_inference_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..0ee589c821a77af7f6714fefd7bebff89218dad8 --- /dev/null +++ b/paddle/fluid/framework/var_type_inference_test.cc @@ -0,0 +1,105 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/var_type_inference.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { + +class SumOpMaker : public OpProtoAndCheckerMaker { + public: + SumOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "").AsDuplicable(); + AddOutput("Out", ""); + AddComment(""); + } +}; + +class SumOpVarTypeInference : public VarTypeInference { + public: + void operator()(const OpDesc &op_desc, BlockDesc *block) const override { + auto &inputs = op_desc.Input("X"); + auto default_var_type = proto::VarDesc::SELECTED_ROWS; + + bool any_input_is_lod_tensor = std::any_of( + inputs.begin(), inputs.end(), [block](const std::string &name) { + return block->Var(name)->GetType() == proto::VarDesc::LOD_TENSOR; + }); + if (any_input_is_lod_tensor) { + default_var_type = proto::VarDesc::LOD_TENSOR; + } + + auto out_var_name = op_desc.Output("Out").front(); + block->Var(out_var_name)->SetType(default_var_type); + } +}; +} // namespace framework +} // namespace paddle + +REGISTER_OPERATOR(sum, paddle::framework::NOP, paddle::framework::SumOpMaker, + paddle::framework::SumOpVarTypeInference); +REGISTER_OPERATOR(sum_without_infer_var_type, paddle::framework::NOP, + paddle::framework::SumOpMaker); + +namespace paddle { +namespace framework { + +TEST(InferVarType, sum_op) { + ProgramDesc prog; + auto *op = prog.MutableBlock(0)->AppendOp(); + op->SetType("sum"); + op->SetInput("X", {"test_a", "test_b", "test_c"}); + op->SetOutput("Out", {"test_out"}); + + prog.MutableBlock(0)->Var("test_a")->SetType(proto::VarDesc::SELECTED_ROWS); + prog.MutableBlock(0)->Var("test_b")->SetType(proto::VarDesc::SELECTED_ROWS); + prog.MutableBlock(0)->Var("test_c")->SetType(proto::VarDesc::SELECTED_ROWS); + prog.MutableBlock(0)->Var("test_out"); + + op->InferVarType(prog.MutableBlock(0)); + + ASSERT_EQ(proto::VarDesc::SELECTED_ROWS, + prog.MutableBlock(0)->Var("test_out")->GetType()); + + prog.MutableBlock(0)->Var("test_b")->SetType(proto::VarDesc::LOD_TENSOR); + op->InferVarType(prog.MutableBlock(0)); + ASSERT_EQ(proto::VarDesc::LOD_TENSOR, + 
prog.MutableBlock(0)->Var("test_out")->GetType()); +} + +TEST(InferVarType, sum_op_without_infer_var_type) { + ProgramDesc prog; + auto *op = prog.MutableBlock(0)->AppendOp(); + op->SetType("sum_without_infer_var_type"); + op->SetInput("X", {"test2_a", "test2_b", "test2_c"}); + op->SetOutput("Out", {"test2_out"}); + + prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarDesc::SELECTED_ROWS); + prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarDesc::SELECTED_ROWS); + prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarDesc::SELECTED_ROWS); + prog.MutableBlock(0)->Var("test2_out"); + + op->InferVarType(prog.MutableBlock(0)); + + ASSERT_EQ(proto::VarDesc_VarType_LOD_TENSOR, + prog.MutableBlock(0)->Var("test2_out")->GetType()); +} + +} // namespace framework +} // namespace paddle \ No newline at end of file diff --git a/paddle/fluid/framework/variable.h b/paddle/fluid/framework/variable.h new file mode 100644 index 0000000000000000000000000000000000000000..9fb8ca92d68203a0bf8ec6ecd30072374b5fe4af --- /dev/null +++ b/paddle/fluid/framework/variable.h @@ -0,0 +1,95 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include + +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { + +class Variable { + public: + template + const T& Get() const { + PADDLE_ENFORCE(holder_ != nullptr, "Variable must hold some thing"); + PADDLE_ENFORCE(IsType(), + "Variable must be type %s, the holding type is %s", + typeid(T).name(), holder_->Type().name()); + return *static_cast(holder_->Ptr()); + } + + bool IsInitialized() const { return holder_ != nullptr; } + + template + T* GetMutable() { + if (!IsType()) { + holder_.reset(new PlaceholderImpl(new T())); + } + return static_cast(holder_->Ptr()); + } + + template + bool IsType() const { + return holder_ != nullptr && + std::type_index(typeid(T)) == std::type_index(holder_->Type()); + } + + void Clear() { holder_.reset(); } + + std::type_index Type() const { + PADDLE_ENFORCE(holder_ != nullptr, "Must hold memory"); + return holder_->Type(); + } + + private: + struct Placeholder { + virtual ~Placeholder() {} + virtual const std::type_info& Type() const = 0; + virtual void* Ptr() const = 0; + }; + + // Placeholder hides type T, so it doesn't appear as a template + // parameter of Variable. + template + struct PlaceholderImpl : public Placeholder { + PlaceholderImpl(T* ptr) : ptr_(ptr), type_(typeid(T)) {} + + virtual const std::type_info& Type() const { return type_; } + virtual void* Ptr() const { return static_cast(ptr_.get()); } + + std::unique_ptr ptr_; + const std::type_info& type_; + }; + + std::unique_ptr + holder_; // pointers to a PlaceholderImpl object indeed. + + // name_ is only meaningful with a Scope and accessible by it. + // + // NOTE: Please don't expose name_ by adding methods like + // Variable::Name or Scope::VarName! 
A variable could have a human + // readable name or an auto-generated scope-unique name. In the + // former case, the caller knows the name and doesn't need to access + // the name; in the latter case, the variable should be identified + // by its address but not the unreadable name. + friend class Scope; + const std::string* name_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/variable.md b/paddle/fluid/framework/variable.md similarity index 100% rename from paddle/framework/variable.md rename to paddle/fluid/framework/variable.md diff --git a/paddle/fluid/framework/variable_test.cc b/paddle/fluid/framework/variable_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..8c14e506fd7fd480012135b316479e45bed5584e --- /dev/null +++ b/paddle/fluid/framework/variable_test.cc @@ -0,0 +1,41 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/variable.h" + +TEST(Variable, GetMutable) { + using paddle::framework::Variable; + + struct Tensor { + int content_; + }; + + std::unique_ptr v(new Variable()); + + Tensor* t = v->GetMutable(); + t->content_ = 1234; + + const Tensor& tt = v->Get(); + EXPECT_EQ(1234, tt.content_); + + std::string* s = v->GetMutable(); + *s = "hello"; + + const std::string& ss = v->Get(); + EXPECT_EQ("hello", ss); +} diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..bdb147955ca0700dc0854b54c38d961caf8845f3 --- /dev/null +++ b/paddle/fluid/inference/CMakeLists.txt @@ -0,0 +1,18 @@ +set(FLUID_CORE_MODULES proto_desc paddle_memory lod_tensor executor prune init) + +cc_library(paddle_fluid_api + SRCS io.cc + DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) + +# Create static library +cc_library(paddle_fluid DEPS paddle_fluid_api ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) + +# Create shared library +cc_library(paddle_fluid_shared SHARED + SRCS io.cc + DEPS ARCHIVE_START ${GLOB_OP_LIB} ${FLUID_CORE_MODULES} ARCHIVE_END) +set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid) + +if(WITH_TESTING) + add_subdirectory(tests/book) +endif() diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc new file mode 100644 index 0000000000000000000000000000000000000000..58d7ab40bfa67595a9c7c61ed431a7cf9509e1f7 --- /dev/null +++ b/paddle/fluid/inference/io.cc @@ -0,0 +1,139 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/io.h" + +#include +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/feed_fetch_type.h" + +namespace paddle { +namespace inference { + +void ReadBinaryFile(const std::string& filename, std::string& contents) { + VLOG(3) << "loading model from " << filename; + std::ifstream inputfs(filename, std::ios::in | std::ios::binary); + inputfs.seekg(0, std::ios::end); + contents.clear(); + contents.resize(inputfs.tellg()); + inputfs.seekg(0, std::ios::beg); + inputfs.read(&contents[0], contents.size()); + inputfs.close(); +} + +bool IsParameter(const framework::VarDesc* var, + const framework::ProgramDesc& main_program) { + if (var->Persistable()) { + // There are many unreachable variables in the program + for (size_t i = 0; i < main_program.Size(); ++i) { + const framework::BlockDesc& block = main_program.Block(i); + for (auto* op : block.AllOps()) { + if (op->Type() == framework::kFeedOpType) { + continue; + } + for (auto input_argument_name : op->InputArgumentNames()) { + if (input_argument_name == var->Name()) { + return true; + } + } + } + } + } + return false; +} + +void LoadPersistables(framework::Executor& executor, + framework::Scope& scope, + const framework::ProgramDesc& main_program, + const std::string& dirname, + const std::string& param_filename) { + const framework::BlockDesc& global_block = main_program.Block(0); + + framework::ProgramDesc* load_program = new framework::ProgramDesc(); + framework::BlockDesc* load_block = load_program->MutableBlock(0); + std::vector paramlist; + + for (auto* var : global_block.AllVars()) { + if (IsParameter(var, main_program)) { + VLOG(3) << "parameter's name: " << var->Name(); + + framework::VarDesc* new_var = load_block->Var(var->Name()); + new_var->SetShape(var->GetShape()); + new_var->SetDataType(var->GetDataType()); + new_var->SetType(var->GetType()); + new_var->SetLoDLevel(var->GetLoDLevel()); + new_var->SetPersistable(true); + + if (!param_filename.empty()) { + paramlist.push_back(new_var->Name()); + } else { + // append_op + framework::OpDesc* op = load_block->AppendOp(); + op->SetType("load"); + op->SetOutput("Out", {new_var->Name()}); + op->SetAttr("file_path", {dirname + "/" + new_var->Name()}); + op->CheckAttrs(); + } + } + } + + if (!param_filename.empty()) { + // sort paramlist to have consistent ordering + std::sort(paramlist.begin(), paramlist.end()); + // append just the load_combine op + framework::OpDesc* op = load_block->AppendOp(); + op->SetType("load_combine"); + op->SetOutput("Out", paramlist); + op->SetAttr("file_path", {param_filename}); + op->CheckAttrs(); + } + + executor.Run(*load_program, &scope, 0, true, true); + + VLOG(3) << "Ran loading successfully"; + delete load_program; +} + +std::unique_ptr Load(framework::Executor& executor, + framework::Scope& scope, + const std::string& dirname) { + std::string model_filename = dirname + "/__model__"; + std::string program_desc_str; + ReadBinaryFile(model_filename, program_desc_str); + + std::unique_ptr main_program( + new framework::ProgramDesc(program_desc_str)); + + LoadPersistables(executor, scope, 
*main_program, dirname, ""); + return main_program; +} + +std::unique_ptr Load( + framework::Executor& executor, + framework::Scope& scope, + const std::string& prog_filename, + const std::string& param_filename) { + std::string model_filename = prog_filename; + std::string program_desc_str; + ReadBinaryFile(model_filename, program_desc_str); + + std::unique_ptr main_program( + new framework::ProgramDesc(program_desc_str)); + + LoadPersistables(executor, scope, *main_program, "", param_filename); + return main_program; +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/io.h b/paddle/fluid/inference/io.h new file mode 100644 index 0000000000000000000000000000000000000000..9d7864060646d9a480ce6ced6a1f4364e83938c0 --- /dev/null +++ b/paddle/fluid/inference/io.h @@ -0,0 +1,43 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace inference { + +void LoadPersistables(framework::Executor& executor, + framework::Scope& scope, + const framework::ProgramDesc& main_program, + const std::string& dirname, + const std::string& param_filename); + +std::unique_ptr Load(framework::Executor& executor, + framework::Scope& scope, + const std::string& dirname); + +std::unique_ptr Load(framework::Executor& executor, + framework::Scope& scope, + const std::string& prog_filename, + const std::string& param_filename); + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/book/CMakeLists.txt b/paddle/fluid/inference/tests/book/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..9fe76afb582a13b741ab086f0c62d77e86d4e8bb --- /dev/null +++ b/paddle/fluid/inference/tests/book/CMakeLists.txt @@ -0,0 +1,34 @@ +function(inference_test TARGET_NAME) + set(options "") + set(oneValueArgs "") + set(multiValueArgs ARGS) + cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + set(PYTHON_TESTS_DIR ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/tests) + set(arg_list "") + if(inference_test_ARGS) + foreach(arg ${inference_test_ARGS}) + list(APPEND arg_list "_${arg}") + endforeach() + else() + list(APPEND arg_list "_") + endif() + foreach(arg ${arg_list}) + string(REGEX REPLACE "^_$" "" arg "${arg}") + cc_test(test_inference_${TARGET_NAME}${arg} + SRCS test_inference_${TARGET_NAME}.cc + DEPS ARCHIVE_START paddle_fluid ARCHIVE_END + ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.inference.model) + set_tests_properties(test_inference_${TARGET_NAME}${arg} + PROPERTIES DEPENDS test_${TARGET_NAME}) + endforeach() +endfunction(inference_test) + +inference_test(fit_a_line) +inference_test(image_classification ARGS vgg resnet) +inference_test(label_semantic_roles) +inference_test(recognize_digits ARGS mlp) 
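(Aside on the io.h entry points exercised by these tests: the two Load overloads correspond to the two ways a model can be saved; a hedged sketch, with placeholder paths.)

// Sketch only, not part of this diff; paths are placeholders.
void LoadExample(paddle::framework::Executor& executor,
                 paddle::framework::Scope& scope) {
  // Parameters saved as one file per variable under the model directory:
  auto program1 = paddle::inference::Load(executor, scope, "/path/to/model_dir");

  // Program and all parameters saved into two combined files:
  auto program2 = paddle::inference::Load(executor, scope,
                                          "/path/to/__model_combined__",
                                          "/path/to/__params_combined__");
}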
+inference_test(recommender_system) +inference_test(rnn_encoder_decoder) +inference_test(understand_sentiment) +inference_test(word2vec) diff --git a/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc b/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc new file mode 100644 index 0000000000000000000000000000000000000000..fa18e69b3ac7e984172dd14a3cb8d48158dfb471 --- /dev/null +++ b/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc @@ -0,0 +1,57 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "gflags/gflags.h" +#include "paddle/fluid/inference/tests/test_helper.h" + +DEFINE_string(dirname, "", "Directory of the inference model."); + +TEST(inference, fit_a_line) { + if (FLAGS_dirname.empty()) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + + paddle::framework::LoDTensor input; + // The second dim of the input tensor should be 13 + // The input data should be >= 0 + int64_t batch_size = 10; + SetupTensor( + input, {batch_size, 13}, static_cast(0), static_cast(10)); + std::vector cpu_feeds; + cpu_feeds.push_back(&input); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + TestInference(dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + TestInference(dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << output2.dims(); + + CheckError(output1, output2); +#endif +} diff --git a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc new file mode 100644 index 0000000000000000000000000000000000000000..27f17712bca4103e2556cb375339ca785f53bd4f --- /dev/null +++ b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc @@ -0,0 +1,63 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
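(Aside on the fit_a_line test above: once TestInference has filled output1, the predicted values can be read straight from the tensor buffer; a small illustrative sketch.)

// Sketch only, not part of this diff.
void PrintPredictions(const paddle::framework::LoDTensor& output) {
  const float* data = output.data<float>();
  for (int64_t i = 0; i < output.numel(); ++i) {
    VLOG(3) << "prediction[" << i << "] = " << data[i];
  }
}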
*/ + +#include +#include "gflags/gflags.h" +#include "paddle/fluid/inference/tests/test_helper.h" + +DEFINE_string(dirname, "", "Directory of the inference model."); + +TEST(inference, image_classification) { + if (FLAGS_dirname.empty()) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + + int64_t batch_size = 1; + + paddle::framework::LoDTensor input; + // Use normilized image pixels as input data, + // which should be in the range [0.0, 1.0]. + SetupTensor(input, + {batch_size, 3, 32, 32}, + static_cast(0), + static_cast(1)); + std::vector cpu_feeds; + cpu_feeds.push_back(&input); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + TestInference(dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + TestInference(dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << output2.dims(); + + CheckError(output1, output2); +#endif +} diff --git a/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc b/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc new file mode 100644 index 0000000000000000000000000000000000000000..55acd95f50906b13a5a906e0bcc2e73a0c7f8ef2 --- /dev/null +++ b/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc @@ -0,0 +1,77 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "gflags/gflags.h" +#include "paddle/fluid/inference/tests/test_helper.h" + +DEFINE_string(dirname, "", "Directory of the inference model."); + +TEST(inference, label_semantic_roles) { + if (FLAGS_dirname.empty()) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. 
Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + + paddle::framework::LoDTensor word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, + ctx_p2, mark; + paddle::framework::LoD lod{{0, 4, 10}}; + + SetupLoDTensor(word, lod, static_cast(0), static_cast(1)); + SetupLoDTensor( + predicate, lod, static_cast(0), static_cast(1)); + SetupLoDTensor(ctx_n2, lod, static_cast(0), static_cast(1)); + SetupLoDTensor(ctx_n1, lod, static_cast(0), static_cast(1)); + SetupLoDTensor(ctx_0, lod, static_cast(0), static_cast(1)); + SetupLoDTensor(ctx_p1, lod, static_cast(0), static_cast(1)); + SetupLoDTensor(ctx_p2, lod, static_cast(0), static_cast(1)); + SetupLoDTensor(mark, lod, static_cast(0), static_cast(1)); + + std::vector cpu_feeds; + cpu_feeds.push_back(&word); + cpu_feeds.push_back(&predicate); + cpu_feeds.push_back(&ctx_n2); + cpu_feeds.push_back(&ctx_n1); + cpu_feeds.push_back(&ctx_0); + cpu_feeds.push_back(&ctx_p1); + cpu_feeds.push_back(&ctx_p2); + cpu_feeds.push_back(&mark); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + TestInference(dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << output1.lod(); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + TestInference(dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << output2.lod(); + LOG(INFO) << output2.dims(); + + CheckError(output1, output2); +#endif +} diff --git a/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc b/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc new file mode 100644 index 0000000000000000000000000000000000000000..99cf0f3095bf7f93d53272e0ae13242484d7128c --- /dev/null +++ b/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc @@ -0,0 +1,105 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "gflags/gflags.h" +#include "paddle/fluid/inference/tests/test_helper.h" + +DEFINE_string(dirname, "", "Directory of the inference model."); + +TEST(inference, recognize_digits) { + if (FLAGS_dirname.empty()) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + + int64_t batch_size = 1; + + paddle::framework::LoDTensor input; + // Use normilized image pixels as input data, + // which should be in the range [-1.0, 1.0]. 
+ SetupTensor(input, + {batch_size, 1, 28, 28}, + static_cast(-1), + static_cast(1)); + std::vector cpu_feeds; + cpu_feeds.push_back(&input); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + TestInference(dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + TestInference(dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << output2.dims(); + + CheckError(output1, output2); +#endif +} + +TEST(inference, recognize_digits_combine) { + if (FLAGS_dirname.empty()) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + + paddle::framework::LoDTensor input; + // Use normilized image pixels as input data, + // which should be in the range [-1.0, 1.0]. + SetupTensor( + input, {1, 28, 28}, static_cast(-1), static_cast(1)); + std::vector cpu_feeds; + cpu_feeds.push_back(&input); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + TestInference( + dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + TestInference( + dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << output2.dims(); + + CheckError(output1, output2); +#endif +} diff --git a/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc b/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc new file mode 100644 index 0000000000000000000000000000000000000000..9208c2a59965ad5296238a23e89cd290b5e19740 --- /dev/null +++ b/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc @@ -0,0 +1,87 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "gflags/gflags.h" +#include "paddle/fluid/inference/tests/test_helper.h" + +DEFINE_string(dirname, "", "Directory of the inference model."); + +TEST(inference, recommender_system) { + if (FLAGS_dirname.empty()) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. 
Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + + int64_t batch_size = 1; + + paddle::framework::LoDTensor user_id, gender_id, age_id, job_id, movie_id, + category_id, movie_title; + + // Use the first data from paddle.dataset.movielens.test() as input + std::vector user_id_data = {1}; + SetupTensor(user_id, {batch_size, 1}, user_id_data); + + std::vector gender_id_data = {1}; + SetupTensor(gender_id, {batch_size, 1}, gender_id_data); + + std::vector age_id_data = {0}; + SetupTensor(age_id, {batch_size, 1}, age_id_data); + + std::vector job_id_data = {10}; + SetupTensor(job_id, {batch_size, 1}, job_id_data); + + std::vector movie_id_data = {783}; + SetupTensor(movie_id, {batch_size, 1}, movie_id_data); + + std::vector category_id_data = {10, 8, 9}; + SetupLoDTensor(category_id, {3, 1}, {{0, 3}}, category_id_data); + + std::vector movie_title_data = {1069, 4140, 2923, 710, 988}; + SetupLoDTensor(movie_title, {5, 1}, {{0, 5}}, movie_title_data); + + std::vector cpu_feeds; + cpu_feeds.push_back(&user_id); + cpu_feeds.push_back(&gender_id); + cpu_feeds.push_back(&age_id); + cpu_feeds.push_back(&job_id); + cpu_feeds.push_back(&movie_id); + cpu_feeds.push_back(&category_id); + cpu_feeds.push_back(&movie_title); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + TestInference(dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + TestInference(dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << output2.dims(); + + CheckError(output1, output2); +#endif +} diff --git a/paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc b/paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc new file mode 100644 index 0000000000000000000000000000000000000000..c88ca30cb781c1980d960c5e4e1137dcfd54afac --- /dev/null +++ b/paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc @@ -0,0 +1,65 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "gflags/gflags.h" +#include "paddle/fluid/inference/tests/test_helper.h" + +DEFINE_string(dirname, "", "Directory of the inference model."); + +TEST(inference, rnn_encoder_decoder) { + if (FLAGS_dirname.empty()) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. 
Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + + paddle::framework::LoDTensor word_data, trg_word; + paddle::framework::LoD lod{{0, 4, 10}}; + + SetupLoDTensor( + word_data, lod, static_cast(0), static_cast(1)); + SetupLoDTensor( + trg_word, lod, static_cast(0), static_cast(1)); + + std::vector cpu_feeds; + cpu_feeds.push_back(&word_data); + cpu_feeds.push_back(&trg_word); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + TestInference(dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << output1.lod(); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + TestInference(dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << output2.lod(); + LOG(INFO) << output2.dims(); + + CheckError(output1, output2); +#endif +} diff --git a/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc b/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc new file mode 100644 index 0000000000000000000000000000000000000000..3b29d52880cef1710696074ed8b2fdecf4f9fcca --- /dev/null +++ b/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc @@ -0,0 +1,60 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "gflags/gflags.h" +#include "paddle/fluid/inference/tests/test_helper.h" + +DEFINE_string(dirname, "", "Directory of the inference model."); + +TEST(inference, understand_sentiment) { + if (FLAGS_dirname.empty()) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. 
Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + + paddle::framework::LoDTensor words; + paddle::framework::LoD lod{{0, 4, 10}}; + SetupLoDTensor(words, lod, static_cast(0), static_cast(10)); + + std::vector cpu_feeds; + cpu_feeds.push_back(&words); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + TestInference(dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << output1.lod(); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + TestInference(dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << output2.lod(); + LOG(INFO) << output2.dims(); + + CheckError(output1, output2); +#endif +} diff --git a/paddle/fluid/inference/tests/book/test_inference_word2vec.cc b/paddle/fluid/inference/tests/book/test_inference_word2vec.cc new file mode 100644 index 0000000000000000000000000000000000000000..93376b6824daf000dd9996c17ca9737b5b600e10 --- /dev/null +++ b/paddle/fluid/inference/tests/book/test_inference_word2vec.cc @@ -0,0 +1,68 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "gflags/gflags.h" +#include "paddle/fluid/inference/tests/test_helper.h" + +DEFINE_string(dirname, "", "Directory of the inference model."); + +TEST(inference, word2vec) { + if (FLAGS_dirname.empty()) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. 
Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + + paddle::framework::LoDTensor first_word, second_word, third_word, fourth_word; + paddle::framework::LoD lod{{0, 1}}; + int64_t dict_size = 2072; // Hard-coding the size of dictionary + + SetupLoDTensor(first_word, lod, static_cast(0), dict_size); + SetupLoDTensor(second_word, lod, static_cast(0), dict_size); + SetupLoDTensor(third_word, lod, static_cast(0), dict_size); + SetupLoDTensor(fourth_word, lod, static_cast(0), dict_size); + + std::vector cpu_feeds; + cpu_feeds.push_back(&first_word); + cpu_feeds.push_back(&second_word); + cpu_feeds.push_back(&third_word); + cpu_feeds.push_back(&fourth_word); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + TestInference(dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << output1.lod(); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + TestInference(dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << output2.lod(); + LOG(INFO) << output2.dims(); + + CheckError(output1, output2); +#endif +} diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..a6c93aa0737f79ca1d626862256d3c79a36868ae --- /dev/null +++ b/paddle/fluid/inference/tests/test_helper.h @@ -0,0 +1,140 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
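(Aside on the LoD values used by the tests above: lod{{0, 4, 10}} is one level of sequence offsets, i.e. two sequences covering rows [0, 4) and [4, 10); the SetupLoDTensor helper defined below sizes the tensor from the last offset. A brief sketch with made-up values.)

// Sketch only, not part of this diff.
void LoDExample() {
  paddle::framework::LoDTensor words;
  paddle::framework::LoD lod{{0, 4, 10}};  // two sequences: 4 and 6 tokens
  // SetupLoDTensor allocates a {10, 1} tensor (10 == lod[0].back()) and
  // fills it with values in [0, 10].
  SetupLoDTensor(words, lod, static_cast<int64_t>(0), static_cast<int64_t>(10));
}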
*/ + +#include +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/inference/io.h" + +template +void SetupTensor(paddle::framework::LoDTensor& input, + paddle::framework::DDim dims, + T lower, + T upper) { + srand(time(0)); + T* input_ptr = input.mutable_data(dims, paddle::platform::CPUPlace()); + for (int i = 0; i < input.numel(); ++i) { + input_ptr[i] = + (static_cast(rand()) / static_cast(RAND_MAX)) * (upper - lower) + + lower; + } +} + +template +void SetupTensor(paddle::framework::LoDTensor& input, + paddle::framework::DDim dims, + std::vector& data) { + CHECK_EQ(paddle::framework::product(dims), static_cast(data.size())); + T* input_ptr = input.mutable_data(dims, paddle::platform::CPUPlace()); + memcpy(input_ptr, data.data(), input.numel() * sizeof(T)); +} + +template +void SetupLoDTensor(paddle::framework::LoDTensor& input, + paddle::framework::LoD& lod, + T lower, + T upper) { + input.set_lod(lod); + int dim = lod[0][lod[0].size() - 1]; + SetupTensor(input, {dim, 1}, lower, upper); +} + +template +void SetupLoDTensor(paddle::framework::LoDTensor& input, + paddle::framework::DDim dims, + paddle::framework::LoD lod, + std::vector& data) { + const size_t level = lod.size() - 1; + CHECK_EQ(dims[0], static_cast((lod[level]).back())); + input.set_lod(lod); + SetupTensor(input, dims, data); +} + +template +void CheckError(paddle::framework::LoDTensor& output1, + paddle::framework::LoDTensor& output2) { + // Check lod information + EXPECT_EQ(output1.lod(), output2.lod()); + + EXPECT_EQ(output1.dims(), output2.dims()); + EXPECT_EQ(output1.numel(), output2.numel()); + + T err = static_cast(0); + if (typeid(T) == typeid(float)) { + err = 1E-3; + } else if (typeid(T) == typeid(double)) { + err = 1E-6; + } else { + err = 0; + } + + size_t count = 0; + for (int64_t i = 0; i < output1.numel(); ++i) { + if (fabs(output1.data()[i] - output2.data()[i]) > err) { + count++; + } + } + EXPECT_EQ(count, 0U) << "There are " << count << " different elements."; +} + +template +void TestInference(const std::string& dirname, + const std::vector& cpu_feeds, + std::vector& cpu_fetchs) { + // 1. Define place, executor, scope + auto place = Place(); + auto executor = paddle::framework::Executor(place); + auto* scope = new paddle::framework::Scope(); + + // 2. Initialize the inference_program and load parameters + std::unique_ptr inference_program; + if (IsCombined) { + // All parameters are saved in a single file. + // Hard-coding the file names of program and parameters in unittest. + // Users are free to specify different filename + // (provided: the filenames are changed in the python api as well: io.py) + std::string prog_filename = "__model_combined__"; + std::string param_filename = "__params_combined__"; + inference_program = paddle::inference::Load(executor, + *scope, + dirname + "/" + prog_filename, + dirname + "/" + param_filename); + } else { + // Parameters are saved in separate files sited in the specified `dirname`. + inference_program = paddle::inference::Load(executor, *scope, dirname); + } + + // 3. Get the feed_target_names and fetch_target_names + const std::vector& feed_target_names = + inference_program->GetFeedTargetNames(); + const std::vector& fetch_target_names = + inference_program->GetFetchTargetNames(); + + // 4. 
Prepare inputs: set up maps for feed targets + std::map feed_targets; + for (size_t i = 0; i < feed_target_names.size(); ++i) { + // Please make sure that cpu_feeds[i] is right for feed_target_names[i] + feed_targets[feed_target_names[i]] = cpu_feeds[i]; + } + + // 5. Define Tensor to get the outputs: set up maps for fetch targets + std::map fetch_targets; + for (size_t i = 0; i < fetch_target_names.size(); ++i) { + fetch_targets[fetch_target_names[i]] = cpu_fetchs[i]; + } + + // 6. Run the inference program + executor.Run(*inference_program, scope, feed_targets, fetch_targets); + + delete scope; +} diff --git a/paddle/platform/.clang-format b/paddle/fluid/memory/.clang-format similarity index 100% rename from paddle/platform/.clang-format rename to paddle/fluid/memory/.clang-format diff --git a/paddle/memory/CMakeLists.txt b/paddle/fluid/memory/CMakeLists.txt similarity index 100% rename from paddle/memory/CMakeLists.txt rename to paddle/fluid/memory/CMakeLists.txt diff --git a/paddle/memory/README.md b/paddle/fluid/memory/README.md similarity index 100% rename from paddle/memory/README.md rename to paddle/fluid/memory/README.md diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/fluid/memory/detail/CMakeLists.txt similarity index 100% rename from paddle/memory/detail/CMakeLists.txt rename to paddle/fluid/memory/detail/CMakeLists.txt diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc new file mode 100644 index 0000000000000000000000000000000000000000..2cee8271d27014815b19175ef93759d6a07b7e73 --- /dev/null +++ b/paddle/fluid/memory/detail/buddy_allocator.cc @@ -0,0 +1,329 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/memory/detail/buddy_allocator.h" +#include "glog/logging.h" + +namespace paddle { +namespace memory { +namespace detail { + +BuddyAllocator::BuddyAllocator(SystemAllocator* system_allocator, + size_t min_chunk_size, size_t max_chunk_size) + : min_chunk_size_(min_chunk_size), + max_chunk_size_(max_chunk_size), + cache_(system_allocator->UseGpu()), + system_allocator_(std::move(system_allocator)) {} + +BuddyAllocator::~BuddyAllocator() { + VLOG(10) << "BuddyAllocator Disconstructor makes sure that all of these " + "have actually been freed"; + while (!pool_.empty()) { + auto block = static_cast(std::get<2>(*pool_.begin())); + VLOG(10) << "Free from block (" << block << ", " << max_chunk_size_ << ")"; + + system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); + cache_.invalidate(block); + pool_.erase(pool_.begin()); + } +} + +inline size_t align(size_t size, size_t alignment) { + size_t remaining = size % alignment; + return remaining == 0 ? 
size : size + (alignment - remaining); +} + +void* BuddyAllocator::Alloc(size_t unaligned_size) { + // adjust allocation alignment + size_t size = align(unaligned_size + sizeof(Metadata), min_chunk_size_); + + // acquire the allocator lock + std::lock_guard lock(mutex_); + + VLOG(10) << "Allocate " << unaligned_size << " bytes from chunk size " + << size; + + // if the allocation is huge, send directly to the system allocator + if (size > max_chunk_size_) { + VLOG(10) << "Allocate from system allocator."; + return SystemAlloc(size); + } + + // query and allocate from the existing chunk + auto it = FindExistChunk(size); + + // refill the pool if failure + if (it == pool_.end()) { + it = RefillPool(); + // if still failure, fail fatally + if (it == pool_.end()) { + return nullptr; + } + } else { + VLOG(10) << "Allocation from existing memory block " << std::get<2>(*it) + << " at address " + << reinterpret_cast(std::get<2>(*it))->data(); + } + + total_used_ += size; + total_free_ -= size; + + // split the allocation and return data for use + return reinterpret_cast(SplitToAlloc(it, size))->data(); +} + +void BuddyAllocator::Free(void* p) { + // Point back to metadata + auto block = static_cast(p)->metadata(); + + // Acquire the allocator lock + std::lock_guard lock(mutex_); + + VLOG(10) << "Free from address " << block; + + if (block->type(cache_) == MemoryBlock::HUGE_CHUNK) { + VLOG(10) << "Free directly from system allocator"; + system_allocator_->Free(block, block->total_size(cache_), + block->index(cache_)); + + // Invalidate GPU allocation from cache + cache_.invalidate(block); + + return; + } + + block->mark_as_free(cache_); + + total_used_ -= block->total_size(cache_); + total_free_ += block->total_size(cache_); + + // Trying to merge the right buddy + if (block->has_right_buddy(cache_)) { + VLOG(10) << "Merging this block " << block << " with its right buddy " + << block->right_buddy(cache_); + + auto right_buddy = block->right_buddy(cache_); + + if (right_buddy->type(cache_) == MemoryBlock::FREE_CHUNK) { + // Take away right buddy from pool + pool_.erase(IndexSizeAddress(right_buddy->index(cache_), + right_buddy->total_size(cache_), + right_buddy)); + + // merge its right buddy to the block + block->merge(cache_, right_buddy); + } + } + + // Trying to merge the left buddy + if (block->has_left_buddy(cache_)) { + VLOG(10) << "Merging this block " << block << " with its left buddy " + << block->left_buddy(cache_); + + auto left_buddy = block->left_buddy(cache_); + + if (left_buddy->type(cache_) == MemoryBlock::FREE_CHUNK) { + // Take away right buddy from pool + pool_.erase(IndexSizeAddress(left_buddy->index(cache_), + left_buddy->total_size(cache_), left_buddy)); + + // merge the block to its left buddy + left_buddy->merge(cache_, block); + block = left_buddy; + } + } + + // Dumping this block into pool + VLOG(10) << "Inserting free block (" << block << ", " + << block->total_size(cache_) << ")"; + pool_.insert( + IndexSizeAddress(block->index(cache_), block->total_size(cache_), block)); + + // Clean up if existing too much free memory + + // Prefer freeing fallback allocation first + CleanIdleFallBackAlloc(); + + // Free normal allocation + CleanIdleNormalAlloc(); +} + +size_t BuddyAllocator::Used() { return total_used_; } + +void* BuddyAllocator::SystemAlloc(size_t size) { + size_t index = 0; + void* p = system_allocator_->Alloc(index, size); + + VLOG(10) << "Allocated " << p << " from system allocator."; + + if (p == nullptr) return nullptr; + + static_cast(p)->init(cache_, 
MemoryBlock::HUGE_CHUNK, index, + size, nullptr, nullptr); + + return static_cast(p)->data(); +} + +BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() { +#ifdef PADDLE_WITH_CUDA + if (system_allocator_->UseGpu()) { + if ((total_used_ + total_free_) == 0) { + // Compute the maximum allocation size for the first allocation. + max_chunk_size_ = platform::GpuMaxChunkSize(); + } + } +#endif + + // Allocate a new maximum sized block + size_t index = 0; + void* p = system_allocator_->Alloc(index, max_chunk_size_); + + if (p == nullptr) return pool_.end(); + + VLOG(10) << "Creating and inserting new block " << p + << " from system allocator"; + + static_cast(p)->init(cache_, MemoryBlock::FREE_CHUNK, index, + max_chunk_size_, nullptr, nullptr); + + // gpu fallback allocation + if (system_allocator_->UseGpu() && + static_cast(p)->index(cache_) == 1) { + fallback_alloc_count_++; + } + + total_free_ += max_chunk_size_; + + // dump the block into pool + return pool_.insert(IndexSizeAddress(index, max_chunk_size_, p)).first; +} + +BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) { + size_t index = 0; + + while (1) { + auto it = pool_.lower_bound(IndexSizeAddress(index, size, nullptr)); + + // no match chunk memory + if (it == pool_.end()) return it; + + if (std::get<0>(*it) > index) { + // find suitable one + if (std::get<1>(*it) >= size) { + return it; + } + // update and continue + index = std::get<0>(*it); + continue; + } + return it; + } +} + +void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it, + size_t size) { + auto block = static_cast(std::get<2>(*it)); + pool_.erase(it); + + VLOG(10) << "Split block (" << block << ", " << block->total_size(cache_) + << ") into"; + block->split(cache_, size); + + VLOG(10) << "Left block (" << block << ", " << block->total_size(cache_) + << ")"; + block->set_type(cache_, MemoryBlock::ARENA_CHUNK); + + // the rest of memory if exist + if (block->has_right_buddy(cache_)) { + if (block->right_buddy(cache_)->type(cache_) == MemoryBlock::FREE_CHUNK) { + VLOG(10) << "Insert right block (" << block->right_buddy(cache_) << ", " + << block->right_buddy(cache_)->total_size(cache_) << ")"; + + pool_.insert( + IndexSizeAddress(block->right_buddy(cache_)->index(cache_), + block->right_buddy(cache_)->total_size(cache_), + block->right_buddy(cache_))); + } + } + + return block; +} + +void BuddyAllocator::CleanIdleFallBackAlloc() { + // If fallback allocation does not exist, return directly + if (!fallback_alloc_count_) return; + + for (auto pool = pool_.rbegin(); pool != pool_.rend();) { + // If free memory block less than max_chunk_size_, return directly + if (std::get<1>(*pool) < max_chunk_size_) return; + + MemoryBlock* block = static_cast(std::get<2>(*pool)); + + // If no GPU fallback allocator, return + if (!system_allocator_->UseGpu() || block->index(cache_) == 0) { + return; + } + + VLOG(10) << "Return block " << block << " to fallback allocator."; + + system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); + cache_.invalidate(block); + + pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base())); + + total_free_ -= max_chunk_size_; + fallback_alloc_count_--; + + // If no fall allocation exists, return directly + if (!fallback_alloc_count_) return; + } +} + +void BuddyAllocator::CleanIdleNormalAlloc() { + auto shall_free_alloc = [&]() -> bool { + // free all fallback allocations + if (fallback_alloc_count_ > 0) { + return true; + } + // keep 2x overhead if we haven't fallen back + if 
((total_used_ + max_chunk_size_) * 2 < total_free_) { + return true; + } + return false; + }; + + if (!shall_free_alloc()) return; + + for (auto pool = pool_.rbegin(); pool != pool_.rend();) { + // If free memory block less than max_chunk_size_, return directly + if (std::get<1>(*pool) < max_chunk_size_) return; + + MemoryBlock* block = static_cast(std::get<2>(*pool)); + + VLOG(10) << "Return block " << block << " to base allocator."; + + system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); + cache_.invalidate(block); + + pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base())); + + total_free_ -= max_chunk_size_; + + if (!shall_free_alloc()) return; + } +} + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/detail/buddy_allocator.h b/paddle/fluid/memory/detail/buddy_allocator.h new file mode 100644 index 0000000000000000000000000000000000000000..644d79330680787f717920652708c0dd5bee1833 --- /dev/null +++ b/paddle/fluid/memory/detail/buddy_allocator.h @@ -0,0 +1,112 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/memory/detail/meta_cache.h" +#include "paddle/fluid/memory/detail/meta_data.h" +#include "paddle/fluid/memory/detail/system_allocator.h" +#include "paddle/fluid/platform/assert.h" +#include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/gpu_info.h" + +#include +#include +#include +#include + +namespace paddle { +namespace memory { +namespace detail { + +class BuddyAllocator { + public: + BuddyAllocator(SystemAllocator* system_allocator, size_t min_chunk_size, + size_t max_chunk_size); + + ~BuddyAllocator(); + + public: + void* Alloc(size_t unaligned_size); + void Free(void* ptr); + size_t Used(); + + public: + // Disable copy and assignment + BuddyAllocator(const BuddyAllocator&) = delete; + BuddyAllocator& operator=(const BuddyAllocator&) = delete; + + private: + // Tuple (allocator index, memory size, memory address) + using IndexSizeAddress = std::tuple; + // Each element in PoolSet is a free allocation + using PoolSet = std::set; + + /*! \brief Allocate fixed-size memory from system */ + void* SystemAlloc(size_t size); + + /*! \brief If existing chunks are not suitable, refill pool */ + PoolSet::iterator RefillPool(); + + /** + * \brief Find the suitable chunk from existing pool and split + * it to left and right buddies + * + * \param it the iterator of pool list + * \param size the size of allocation + * + * \return the left buddy address + */ + void* SplitToAlloc(PoolSet::iterator it, size_t size); + + /*! \brief Find the existing chunk which used to allocation */ + PoolSet::iterator FindExistChunk(size_t size); + + /*! \brief Clean idle fallback allocation */ + void CleanIdleFallBackAlloc(); + + /*! 
\brief Clean idle normal allocation */ + void CleanIdleNormalAlloc(); + + private: + size_t total_used_ = 0; // the total size of used memory + size_t total_free_ = 0; // the total size of free memory + + size_t min_chunk_size_; // the minimum size of each chunk + size_t max_chunk_size_; // the maximum size of each chunk + + private: + /** + * \brief A list of free allocation + * + * \note Only store free chunk memory in pool + */ + PoolSet pool_; + + /*! Record fallback allocation count for auto-scaling */ + size_t fallback_alloc_count_ = 0; + + private: + /*! Unify the metadata format between GPU and CPU allocations */ + MetadataCache cache_; + + private: + /*! Allocate CPU/GPU memory from system */ + SystemAllocator* system_allocator_; + std::mutex mutex_; +}; + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/detail/memory_block.cc b/paddle/fluid/memory/detail/memory_block.cc new file mode 100644 index 0000000000000000000000000000000000000000..23388cdd5b7c44ff91e10aadaa8cc25d8ef29d14 --- /dev/null +++ b/paddle/fluid/memory/detail/memory_block.cc @@ -0,0 +1,157 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/memory/detail/memory_block.h" +#include "paddle/fluid/memory/detail/meta_cache.h" +#include "paddle/fluid/memory/detail/meta_data.h" +#include "paddle/fluid/platform/assert.h" + +namespace paddle { +namespace memory { +namespace detail { + +void MemoryBlock::init(MetadataCache& cache, Type t, size_t index, size_t size, + void* left_buddy, void* right_buddy) { + cache.store(this, Metadata(t, index, size - sizeof(Metadata), size, + static_cast(left_buddy), + static_cast(right_buddy))); +} + +MemoryBlock::Type MemoryBlock::type(MetadataCache& cache) const { + return cache.load(this).type; +} + +size_t MemoryBlock::size(MetadataCache& cache) const { + return cache.load(this).size; +} + +size_t MemoryBlock::total_size(MetadataCache& cache) const { + return cache.load(this).total_size; +} + +MemoryBlock* MemoryBlock::left_buddy(MetadataCache& cache) const { + return cache.load(this).left_buddy; +} + +MemoryBlock* MemoryBlock::right_buddy(MetadataCache& cache) const { + return cache.load(this).right_buddy; +} + +void MemoryBlock::split(MetadataCache& cache, size_t size) { + // make sure the split fits + PADDLE_ASSERT(total_size(cache) >= size); + + // bail out if there is no room for another partition + if (total_size(cache) - size <= sizeof(Metadata)) { + return; + } + + // find the position of the split + void* right_partition = reinterpret_cast(this) + size; + + size_t remaining_size = total_size(cache) - size; + + // Add the new block as a buddy + auto metadata = cache.load(this); + + // Write the metadata for the new block + auto new_block_right_buddy = metadata.right_buddy; + + cache.store( + static_cast(right_partition), + Metadata(FREE_CHUNK, index(cache), remaining_size - sizeof(Metadata), + remaining_size, this, new_block_right_buddy)); + + 
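+  // Shrink this block's own metadata so it now describes only the left
+  // partition (the `size` bytes just carved off); the right partition
+  // written above becomes this block's new right buddy.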
metadata.right_buddy = static_cast(right_partition); + metadata.size = size - sizeof(Metadata); + metadata.total_size = size; + + cache.store(this, metadata); + + // Write metadata for the new block's right buddy + if (new_block_right_buddy != nullptr) { + auto buddy_metadata = cache.load(new_block_right_buddy); + + buddy_metadata.left_buddy = static_cast(right_partition); + + cache.store(new_block_right_buddy, buddy_metadata); + } +} + +void MemoryBlock::merge(MetadataCache& cache, MemoryBlock* right_buddy) { + // only free blocks can be merged + PADDLE_ASSERT(type(cache) == FREE_CHUNK); + PADDLE_ASSERT(right_buddy->type(cache) == FREE_CHUNK); + + auto metadata = cache.load(this); + + // link this->buddy's buddy + metadata.right_buddy = right_buddy->right_buddy(cache); + + // link buddy's buddy -> this + if (metadata.right_buddy != nullptr) { + auto buddy_metadata = cache.load(metadata.right_buddy); + + buddy_metadata.left_buddy = this; + + cache.store(metadata.right_buddy, buddy_metadata); + } + + metadata.size += right_buddy->total_size(cache); + metadata.total_size += right_buddy->total_size(cache); + + cache.store(this, metadata); + cache.store(right_buddy, Metadata(INVALID_CHUNK, 0, 0, 0, nullptr, nullptr)); +} + +void MemoryBlock::mark_as_free(MetadataCache& cache) { + // check for double free or corruption + PADDLE_ASSERT(type(cache) != FREE_CHUNK); + PADDLE_ASSERT(type(cache) != INVALID_CHUNK); + + set_type(cache, FREE_CHUNK); +} + +void MemoryBlock::set_type(MetadataCache& cache, Type t) { + auto metadata = cache.load(this); + + metadata.type = t; + + cache.store(this, metadata); +} + +bool MemoryBlock::has_left_buddy(MetadataCache& cache) const { + return left_buddy(cache) != nullptr; +} + +bool MemoryBlock::has_right_buddy(MetadataCache& cache) const { + return right_buddy(cache) != nullptr; +} + +size_t MemoryBlock::index(MetadataCache& cache) const { + return cache.load(this).index; +} + +void* MemoryBlock::data() const { + return const_cast(reinterpret_cast(this)) + 1; +} + +MemoryBlock* MemoryBlock::metadata() const { + return const_cast(reinterpret_cast( + reinterpret_cast(this) - 1)); +} + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/detail/memory_block.h b/paddle/fluid/memory/detail/memory_block.h similarity index 100% rename from paddle/memory/detail/memory_block.h rename to paddle/fluid/memory/detail/memory_block.h diff --git a/paddle/fluid/memory/detail/meta_cache.cc b/paddle/fluid/memory/detail/meta_cache.cc new file mode 100644 index 0000000000000000000000000000000000000000..7d78811c7715b906aea1b88c13a4c3939db6387d --- /dev/null +++ b/paddle/fluid/memory/detail/meta_cache.cc @@ -0,0 +1,60 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/memory/detail/meta_cache.h" +#include "glog/logging.h" +#include "paddle/fluid/memory/detail/memory_block.h" +#include "paddle/fluid/platform/assert.h" + +namespace paddle { +namespace memory { +namespace detail { + +MetadataCache::MetadataCache(bool uses_gpu) : uses_gpu_(uses_gpu) {} + +Metadata MetadataCache::load(const MemoryBlock* block) { + if (uses_gpu_) { + auto existing_metadata = cache_.find(block); + PADDLE_ASSERT(existing_metadata->second.check_guards()); + return existing_metadata->second; + } else { + auto* meta = reinterpret_cast(block); + VLOG(10) << "Load MetaData type=" << meta->type; + PADDLE_ASSERT(meta->check_guards()); + return *reinterpret_cast(block); + } +} + +void MetadataCache::store(MemoryBlock* block, + const Metadata& original_metadata) { + auto metadata = original_metadata; + + metadata.update_guards(); + + if (uses_gpu_) { + cache_[block] = metadata; + } else { + *reinterpret_cast(block) = metadata; + } +} + +void MetadataCache::invalidate(MemoryBlock* block) { + if (uses_gpu_) { + cache_.erase(block); + } +} + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/detail/meta_cache.h b/paddle/fluid/memory/detail/meta_cache.h new file mode 100644 index 0000000000000000000000000000000000000000..635d6398e697de80d0606a200c2634a93468199d --- /dev/null +++ b/paddle/fluid/memory/detail/meta_cache.h @@ -0,0 +1,64 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/memory/detail/memory_block.h" +#include "paddle/fluid/memory/detail/meta_data.h" + +#include + +namespace paddle { +namespace memory { +namespace detail { + +/** + * \brief A cache for accessing memory block meta-data that may be expensive + * to access directly. + * + * \note This class exists to unify the metadata format between GPU and CPU + * allocations. It should be removed when the CPU can access all GPU + * allocations directly via UVM. + */ +class MetadataCache { + public: + explicit MetadataCache(bool uses_gpu); + + public: + /*! \brief Load the associated metadata for the specified memory block. */ + Metadata load(const MemoryBlock* memory_block); + + /*! \brief Store the associated metadata for the specified memory block. */ + void store(MemoryBlock* memory_block, const Metadata& meta_data); + + /*! \brief Indicate that the specified metadata will no longer be used. 
*/ + void invalidate(MemoryBlock* memory_block); + + public: + MetadataCache(const MetadataCache&) = delete; + MetadataCache& operator=(const MetadataCache&) = delete; + + private: + bool uses_gpu_; + + private: + typedef std::unordered_map MetadataMap; + + private: + MetadataMap cache_; +}; + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/detail/meta_data.cc b/paddle/fluid/memory/detail/meta_data.cc new file mode 100644 index 0000000000000000000000000000000000000000..eae49ebdcffd03eeb192fb7e859666027336245b --- /dev/null +++ b/paddle/fluid/memory/detail/meta_data.cc @@ -0,0 +1,70 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/memory/detail/meta_data.h" + +#include + +namespace paddle { +namespace memory { +namespace detail { + +Metadata::Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts, + MemoryBlock* l, MemoryBlock* r) + : type(t), + index(i), + size(s), + total_size(ts), + left_buddy(l), + right_buddy(r) {} + +Metadata::Metadata() + : type(MemoryBlock::INVALID_CHUNK), + index(0), + size(0), + total_size(0), + left_buddy(nullptr), + right_buddy(nullptr) {} + +template +inline void hash_combine(std::size_t& seed, const T& v) { + std::hash hasher; + seed ^= hasher(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2); +} + +inline size_t hash(const Metadata* metadata, size_t initial_seed) { + size_t seed = initial_seed; + + hash_combine(seed, (size_t)metadata->type); + hash_combine(seed, metadata->index); + hash_combine(seed, metadata->size); + hash_combine(seed, metadata->total_size); + hash_combine(seed, metadata->left_buddy); + hash_combine(seed, metadata->right_buddy); + + return seed; +} + +void Metadata::update_guards() { + guard_begin = hash(this, 1); + guard_end = hash(this, 2); +} + +bool Metadata::check_guards() const { + return guard_begin == hash(this, 1) && guard_end == hash(this, 2); +} + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/detail/meta_data.h b/paddle/fluid/memory/detail/meta_data.h new file mode 100644 index 0000000000000000000000000000000000000000..368523701ef1a7b3bd869e1f0542c42c61448b40 --- /dev/null +++ b/paddle/fluid/memory/detail/meta_data.h @@ -0,0 +1,54 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/memory/detail/memory_block.h" + +#include + +namespace paddle { +namespace memory { +namespace detail { + +class Metadata { + public: + Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts, MemoryBlock* l, + MemoryBlock* r); + Metadata(); + + public: + /*! \brief Update the guards when metadata is changed */ + void update_guards(); + + /*! \brief Check consistency to previous modification */ + bool check_guards() const; + + public: + // TODO(gangliao): compress this + // clang-format off + size_t guard_begin = 0; + MemoryBlock::Type type = MemoryBlock::INVALID_CHUNK; + size_t index = 0; + size_t size = 0; + size_t total_size = 0; + MemoryBlock* left_buddy = nullptr; + MemoryBlock* right_buddy = nullptr; + size_t guard_end = 0; + // clang-format on +}; + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc new file mode 100644 index 0000000000000000000000000000000000000000..1f07c5e789c42997b3c75167a26a5b09875bd498 --- /dev/null +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -0,0 +1,126 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/memory/detail/system_allocator.h" +#include "paddle/fluid/platform/assert.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/gpu_info.h" + +#include // for malloc and free +#include // for mlock and munlock +#include // for std::max + +#include "gflags/gflags.h" + +// If use_pinned_memory is true, CPUAllocator calls mlock, which +// returns pinned and locked memory as staging areas for data exchange +// between host and device. Allocates too much would reduce the amount +// of memory available to the system for paging. So, by default, we +// should set false to use_pinned_memory. +DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory."); +DECLARE_double(fraction_of_gpu_memory_to_use); +namespace paddle { +namespace memory { +namespace detail { + +void* CPUAllocator::Alloc(size_t& index, size_t size) { + // According to http://www.cplusplus.com/reference/cstdlib/malloc/, + // malloc might not return nullptr if size is zero, but the returned + // pointer shall not be dereferenced -- so we make it nullptr. 
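+  // The out-parameter `index` records how the block must be released later:
+  // 0 for plain memory that is simply free()-d, 1 for memory that gets
+  // mlock()-ed below and must be munlock()-ed first (see CPUAllocator::Free).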
+ if (size <= 0) return nullptr; + + index = 0; // unlock memory + + void* p; + +#ifdef PADDLE_WITH_MKLDNN + // refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp + // memory alignment + PADDLE_ENFORCE_EQ(posix_memalign(&p, 4096ul, size), 0); +#else + PADDLE_ENFORCE_EQ(posix_memalign(&p, 32ul, size), 0); +#endif + PADDLE_ENFORCE(p, "Fail to allocate CPU memory: size = %d .", size); + + if (p != nullptr) { + if (FLAGS_use_pinned_memory) { + index = 1; + mlock(p, size); // lock memory + } + } + + return p; +} + +void CPUAllocator::Free(void* p, size_t size, size_t index) { + if (p != nullptr && index == 1) { + munlock(p, size); + } + free(p); +} + +bool CPUAllocator::UseGpu() const { return false; } + +#ifdef PADDLE_WITH_CUDA + +void* GPUAllocator::Alloc(size_t& index, size_t size) { + // CUDA documentation doesn't explain if cudaMalloc returns nullptr + // if size is 0. We just make sure it does. + if (size <= 0) return nullptr; + void* p; + cudaError_t result = cudaMalloc(&p, size); + if (result == cudaSuccess) { + index = 0; + gpu_alloc_size_ += size; + return p; + } else { + LOG(WARNING) + << "Cannot malloc " << size / 1024.0 / 1024.0 + << " MB GPU memory. Please shrink FLAGS_fraction_of_gpu_memory_to_use " + "environment variable to a lower value. Current value is " + << FLAGS_fraction_of_gpu_memory_to_use; + return nullptr; + } +} + +void GPUAllocator::Free(void* p, size_t size, size_t index) { + cudaError_t err; + + if (index == 0) { + PADDLE_ASSERT(gpu_alloc_size_ >= size); + gpu_alloc_size_ -= size; + err = cudaFree(p); + } else { + PADDLE_ASSERT(fallback_alloc_size_ >= size); + fallback_alloc_size_ -= size; + err = cudaFreeHost(p); + } + + // Purposefully allow cudaErrorCudartUnloading, because + // that is returned if you ever call cudaFree after the + // driver has already shutdown. This happens only if the + // process is terminating, in which case we don't care if + // cudaFree succeeds. + if (err != cudaErrorCudartUnloading) { + PADDLE_ENFORCE(err, "cudaFree{Host} failed in GPUAllocator::Free."); + } +} + +bool GPUAllocator::UseGpu() const { return true; } + +#endif + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/detail/system_allocator.h b/paddle/fluid/memory/detail/system_allocator.h similarity index 100% rename from paddle/memory/detail/system_allocator.h rename to paddle/fluid/memory/detail/system_allocator.h diff --git a/paddle/fluid/memory/detail/system_allocator_test.cc b/paddle/fluid/memory/detail/system_allocator_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..a850e480ec948b727980a8020df91958584aea02 --- /dev/null +++ b/paddle/fluid/memory/detail/system_allocator_test.cc @@ -0,0 +1,65 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/memory/detail/system_allocator.h" + +#include +#include + +#include "gflags/gflags.h" +#include "gtest/gtest.h" + +DECLARE_bool(use_pinned_memory); + +void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) { + bool freed = false; + { + size_t index; + void* p = a.Alloc(index, size); + if (size > 0) { + EXPECT_NE(p, nullptr); + } else { + EXPECT_EQ(p, nullptr); + } + + int* i = static_cast(p); + std::shared_ptr ptr(i, [&](void* p) { + freed = true; + a.Free(p, size, index); + }); + } + EXPECT_TRUE(freed); +} + +TEST(CPUAllocator, NoLockMem) { + FLAGS_use_pinned_memory = false; + paddle::memory::detail::CPUAllocator a; + TestAllocator(a, 2048); + TestAllocator(a, 0); +} + +TEST(CPUAllocator, LockMem) { + FLAGS_use_pinned_memory = true; + paddle::memory::detail::CPUAllocator a; + TestAllocator(a, 2048); + TestAllocator(a, 0); +} + +#ifdef PADDLE_WITH_CUDA +TEST(GPUAllocator, Alloc) { + paddle::memory::detail::GPUAllocator a; + TestAllocator(a, 2048); + TestAllocator(a, 0); +} +#endif diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc new file mode 100644 index 0000000000000000000000000000000000000000..8938b3613373a06620a6a0237b3de773c6421edd --- /dev/null +++ b/paddle/fluid/memory/memcpy.cc @@ -0,0 +1,62 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/memory/memcpy.h" + +#include // for memcpy + +namespace paddle { +namespace memory { + +template <> +void Copy(platform::CPUPlace, void* dst, + platform::CPUPlace, + const void* src, size_t num) { + std::memcpy(dst, src, num); +} + +#ifdef PADDLE_WITH_CUDA +template <> +void Copy( + platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place, + const void* src, size_t num, cudaStream_t stream) { + platform::SetDeviceId(src_place.device); + platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); +} + +template <> +void Copy( + platform::CUDAPlace dst_place, void* dst, platform::CPUPlace src_place, + const void* src, size_t num, cudaStream_t stream) { + platform::SetDeviceId(dst_place.device); + platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); +} + +template <> +void Copy( + platform::CUDAPlace dst_place, void* dst, platform::CUDAPlace src_place, + const void* src, size_t num, cudaStream_t stream) { + if (dst_place == src_place) { + platform::SetDeviceId(src_place.device); + platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream); + } else { + platform::GpuMemcpyPeer(dst, dst_place.device, src, src_place.device, num, + stream); + } +} + +#endif + +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/memcpy.h b/paddle/fluid/memory/memcpy.h new file mode 100644 index 0000000000000000000000000000000000000000..77d209c3fbe8256bc94b3eca866f0f7e17a93325 --- /dev/null +++ b/paddle/fluid/memory/memcpy.h @@ -0,0 +1,58 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace memory { + +/** + * \brief Copy memory from one place to another place. + * + * \param[in] DstPlace Destination allocation place (CPU). + * \param[in] dst Destination memory address. + * \param[in] SrcPlace Source allocation place (CPU). + * \param[in] src Source memory address. + * \param[in] num memory size in bytes to copy. + * + */ +template +void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num); + +#ifdef PADDLE_WITH_CUDA + +/** + * \brief Copy memory from one place to another place. + * + * \param[in] DstPlace Destination allocation place (CPU or GPU). + * \param[in] dst Destination memory address. + * \param[in] SrcPlace Source allocation place (CPU or GPU). + * \param[in] src Source memory address. + * \param[in] num memory size in bytes to copy. + * \param[in] stream CUDA stream. + * + * \note For GPU memory copy, CUDA stream need to be specified + * for asynchronously memory copy. + * + */ +template +void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num, + cudaStream_t stream); + +#endif +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/memory.cc b/paddle/fluid/memory/memory.cc new file mode 100644 index 0000000000000000000000000000000000000000..6eedab5d034192c071328b1be5c296227383287e --- /dev/null +++ b/paddle/fluid/memory/memory.cc @@ -0,0 +1,134 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/memory/memory.h" + +#include "glog/logging.h" + +#include "paddle/fluid/memory/detail/buddy_allocator.h" +#include "paddle/fluid/memory/detail/system_allocator.h" +#include "paddle/fluid/platform/gpu_info.h" + +DECLARE_double(fraction_of_gpu_memory_to_use); + +namespace paddle { +namespace memory { + +using BuddyAllocator = detail::BuddyAllocator; + +BuddyAllocator* GetCPUBuddyAllocator() { + static detail::BuddyAllocator* a = nullptr; + if (a == nullptr) { + a = new detail::BuddyAllocator(new detail::CPUAllocator, + platform::CpuMinChunkSize(), + platform::CpuMaxChunkSize()); + } + return a; +} + +template <> +void* Alloc(platform::CPUPlace place, size_t size) { + VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); + void* p = GetCPUBuddyAllocator()->Alloc(size); + VLOG(10) << " pointer=" << p; + return p; +} + +template <> +void Free(platform::CPUPlace place, void* p) { + VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); + GetCPUBuddyAllocator()->Free(p); +} + +template <> +size_t Used(platform::CPUPlace place) { + return GetCPUBuddyAllocator()->Used(); +} + +#ifdef PADDLE_WITH_CUDA + +BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { + static BuddyAllocator** as = NULL; + if (as == NULL) { + int gpu_num = platform::GetCUDADeviceCount(); + as = new BuddyAllocator*[gpu_num]; + for (int gpu = 0; gpu < gpu_num; gpu++) { + as[gpu] = nullptr; + } + } + platform::SetDeviceId(gpu_id); + if (!as[gpu_id]) { + as[gpu_id] = new BuddyAllocator(new detail::GPUAllocator, + platform::GpuMinChunkSize(), + platform::GpuMaxChunkSize()); + VLOG(10) << "\n\nNOTE: each GPU device use " + << FLAGS_fraction_of_gpu_memory_to_use * 100 + << "% of GPU memory.\n" + << "You can set GFlags environment variable '" + << "FLAGS_fraction_of_gpu_memory_to_use" + << "' to change the fraction of GPU usage.\n\n"; + } + return as[gpu_id]; +} + +template <> +size_t Used(platform::CUDAPlace place) { + return GetGPUBuddyAllocator(place.device)->Used(); +} + +template <> +void* Alloc(platform::CUDAPlace place, size_t size) { + auto* buddy_allocator = GetGPUBuddyAllocator(place.device); + auto* ptr = buddy_allocator->Alloc(size); + if (ptr == nullptr) { + int cur_dev = platform::GetCurrentDeviceId(); + platform::SetDeviceId(place.device); + size_t avail, total; + platform::GpuMemoryUsage(avail, total); + LOG(WARNING) << "Cannot allocate " << size << " bytes in GPU " + << place.device << ", available " << avail << " bytes"; + LOG(WARNING) << "total " << total; + LOG(WARNING) << "GpuMinChunkSize " << platform::GpuMinChunkSize(); + LOG(WARNING) << "GpuMaxChunkSize " << platform::GpuMaxChunkSize(); + LOG(WARNING) << "GPU memory used: " << Used(place); + platform::SetDeviceId(cur_dev); + } + return ptr; +} + +template <> +void Free(platform::CUDAPlace place, void* p) { + GetGPUBuddyAllocator(place.device)->Free(p); +} + +#endif + +size_t Usage::operator()(const platform::CPUPlace& cpu) const { + return Used(cpu); +} + +size_t Usage::operator()(const platform::CUDAPlace& gpu) const { +#ifdef PADDLE_WITH_CUDA + return Used(gpu); +#else + PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); +#endif +} + +size_t memory_usage(const platform::Place& p) { + return boost::apply_visitor(Usage(), p); +} + +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/memory.h b/paddle/fluid/memory/memory.h new file mode 100644 index 0000000000000000000000000000000000000000..a9166a6746e1985752ca18ffa7c429e5b35b55bb --- /dev/null +++ 
b/paddle/fluid/memory/memory.h @@ -0,0 +1,103 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace memory { + +/** + * \brief Allocate memory block in one place. + * + * \param[in] place Allocation place (CPU or GPU). + * \param[in] size Allocation size. + * + * \return Allocated memory block address. + * + * \note If return nullptr, it indicates memory allocation failed + * because insufficient memory in current system. When Alloc + * function is invoked, you must check the returned memory + * address is valid or not. + */ +template +void* Alloc(Place place, size_t size); + +/** + * \brief Free memory block in one place. + * + * \param[in] place Allocation place (CPU or GPU). + * \param[in] ptr Memory block address to free. + * + */ +template +void Free(Place place, void* ptr); + +/** + * \brief Total size of used memory in one place. + * + * \param[in] place Allocation place (CPU or GPU). + * + */ +template +size_t Used(Place place); + +struct Usage : public boost::static_visitor { + size_t operator()(const platform::CPUPlace& cpu) const; + size_t operator()(const platform::CUDAPlace& gpu) const; +}; + +size_t memory_usage(const platform::Place& p); + +/** + * \brief Free memory block in one place. + * + * \note In some cases, custom deleter is used to + * deallocate the memory automatically for + * std::unique_ptr in tensor.h. + * + */ +template +class PODDeleter { + static_assert(std::is_pod::value, "T must be POD"); + + public: + explicit PODDeleter(Place place) : place_(place) {} + void operator()(T* ptr) { Free(place_, static_cast(ptr)); } + + private: + Place place_; +}; + +/** + * \brief Free memory block in one place does not meet POD + * + * \note In some cases, custom deleter is used to + * deallocate the memory automatically for + * std::unique_ptr in tensor.h. + * + */ +template +class PlainDeleter { + public: + explicit PlainDeleter(Place place) : place_(place) {} + void operator()(T* ptr) { Free(place_, reinterpret_cast(ptr)); } + + private: + Place place_; +}; + +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/memory_test.cc b/paddle/fluid/memory/memory_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..d7505ef0f36bc8765ba7634f286b67bccc6eacb6 --- /dev/null +++ b/paddle/fluid/memory/memory_test.cc @@ -0,0 +1,144 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/memory/detail/memory_block.h" +#include "paddle/fluid/memory/detail/meta_data.h" + +#include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/place.h" + +#include +#include + +inline bool is_aligned(void const *p) { + return 0 == (reinterpret_cast(p) & 0x3); +} + +size_t align(size_t size, paddle::platform::CPUPlace place) { + size += sizeof(paddle::memory::detail::Metadata); + size_t alignment = paddle::platform::CpuMinChunkSize(); + size_t remaining = size % alignment; + return remaining == 0 ? size : size + (alignment - remaining); +} + +TEST(BuddyAllocator, CPUAllocation) { + void *p = nullptr; + + EXPECT_EQ(p, nullptr); + + paddle::platform::CPUPlace cpu; + p = paddle::memory::Alloc(cpu, 4096); + + EXPECT_NE(p, nullptr); + + paddle::platform::Place place = cpu; + EXPECT_EQ(paddle::memory::Used(cpu), paddle::memory::memory_usage(place)); + + paddle::memory::Free(cpu, p); +} + +TEST(BuddyAllocator, CPUMultAlloc) { + paddle::platform::CPUPlace cpu; + + std::unordered_map ps; + + size_t total_size = paddle::memory::Used(cpu); + EXPECT_EQ(total_size, 0UL); + + for (auto size : + {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) { + ps[paddle::memory::Alloc(cpu, size)] = size; + + // Buddy Allocator doesn't manage too large memory chunk + if (paddle::memory::Used(cpu) == total_size) continue; + + size_t aligned_size = align(size, cpu); + total_size += aligned_size; + EXPECT_EQ(total_size, paddle::memory::Used(cpu)); + } + + for (auto p : ps) { + EXPECT_EQ(is_aligned(p.first), true); + paddle::memory::Free(cpu, p.first); + + // Buddy Allocator doesn't manage too large memory chunk + if (paddle::memory::Used(cpu) == total_size) continue; + + size_t aligned_size = align(p.second, cpu); + total_size -= aligned_size; + EXPECT_EQ(total_size, paddle::memory::Used(cpu)); + } +} + +#ifdef PADDLE_WITH_CUDA + +size_t align(size_t size, paddle::platform::CUDAPlace place) { + size += sizeof(paddle::memory::detail::Metadata); + size_t alignment = paddle::platform::GpuMinChunkSize(); + size_t remaining = size % alignment; + return remaining == 0 ? 
size : size + (alignment - remaining); +} + +TEST(BuddyAllocator, GPUAllocation) { + void *p = nullptr; + + EXPECT_EQ(p, nullptr); + + paddle::platform::CUDAPlace gpu(0); + p = paddle::memory::Alloc(gpu, 4096); + + EXPECT_NE(p, nullptr); + + paddle::platform::Place place = gpu; + EXPECT_EQ(paddle::memory::Used(gpu), paddle::memory::memory_usage(place)); + + paddle::memory::Free(gpu, p); +} + +TEST(BuddyAllocator, GPUMultAlloc) { + paddle::platform::CUDAPlace gpu; + + std::unordered_map ps; + + size_t total_size = paddle::memory::Used(gpu); + EXPECT_EQ(total_size, 0UL); + + for (auto size : + {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) { + ps[paddle::memory::Alloc(gpu, size)] = size; + + // Buddy Allocator doesn't manage too large memory chunk + if (paddle::memory::Used(gpu) == total_size) continue; + + size_t aligned_size = align(size, gpu); + total_size += aligned_size; + EXPECT_EQ(total_size, paddle::memory::Used(gpu)); + } + + for (auto p : ps) { + EXPECT_EQ(is_aligned(p.first), true); + paddle::memory::Free(gpu, p.first); + + // Buddy Allocator doesn't manage too large memory chunk + if (paddle::memory::Used(gpu) == total_size) continue; + + size_t aligned_size = align(p.second, gpu); + total_size -= aligned_size; + EXPECT_EQ(total_size, paddle::memory::Used(gpu)); + } +} + +#endif diff --git a/paddle/fluid/operators/.clang-format b/paddle/fluid/operators/.clang-format new file mode 100644 index 0000000000000000000000000000000000000000..29282dc87e2c499988c17d90d47d44cd5cf7f115 --- /dev/null +++ b/paddle/fluid/operators/.clang-format @@ -0,0 +1,5 @@ +--- +Language: Cpp +BasedOnStyle: Google +Standard: Cpp11 +... diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..cadfd735d7b3feb473c308b04417f0a1e0f22249 --- /dev/null +++ b/paddle/fluid/operators/CMakeLists.txt @@ -0,0 +1,203 @@ +file(GLOB GENERAL_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc") +string(REPLACE ".cc" "" GENERAL_OPS "${GENERAL_OPS}") +set(DEPS_OPS "") +set(pybind_file ${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/pybind.h) +file(WRITE ${pybind_file} "// Generated by the paddle/operator/CMakeLists.txt. DO NOT EDIT!\n\n") +function(op_library TARGET) + # op_library is a function to create op library. The interface is same as + # cc_library. But it handle split GPU/CPU code and link some common library + # for ops. 
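+  # Typical calls (both appear further down in this file):
+  #   op_library(sum_op DEPS selected_rows_functor)    # sources inferred from sum_op.cc/.cu.cc/.cu
+  #   op_library(pool_op SRCS pool_op.cc pool_op.cu.cc pool_cudnn_op.cu.cc DEPS pooling)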
+ set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} PARENT_SCOPE) + set(cc_srcs) + set(cu_srcs) + set(cu_cc_srcs) + set(op_common_deps operator op_registry math_function) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + set(pybind_flag 0) + cmake_parse_arguments(op_library "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + + list(LENGTH op_library_SRCS op_library_SRCS_len) + if (${op_library_SRCS_len} EQUAL 0) + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc) + list(APPEND cc_srcs ${TARGET}.cc) + endif() + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc) + list(APPEND cu_cc_srcs ${TARGET}.cu.cc) + endif() + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) + list(APPEND cu_srcs ${TARGET}.cu) + endif() + else() + foreach(src ${op_library_SRCS}) + if (${src} MATCHES ".*\\.cu$") + list(APPEND cu_srcs ${src}) + elseif(${src} MATCHES ".*\\.cu.cc$") + list(APPEND cu_cc_srcs ${src}) + elseif(${src} MATCHES ".*\\.cc$") + list(APPEND cc_srcs ${src}) + else() + message(FATAL_ERROR "${TARGET} Source file ${src} should only be .cc or .cu") + endif() + endforeach() + endif() + + list(LENGTH cc_srcs cc_srcs_len) + if (${cc_srcs_len} EQUAL 0) + message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file") + endif() + + list(LENGTH op_library_DEPS op_library_DEPS_len) + if (${op_library_DEPS_len} GREATER 0) + set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE) + endif() + if (WITH_GPU) + nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS} + ${op_common_deps}) + else() + cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${op_library_DEPS} + ${op_common_deps}) + endif() + + # Define operators that don't need pybind here. + foreach(manual_pybind_op "net_op" "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op" "create_reader_op") + if ("${TARGET}" STREQUAL "${manual_pybind_op}") + set(pybind_flag 1) + endif() + endforeach() + + # The registration of USE_OP, please refer to paddle/framework/op_registry.h. + # Note that it's enough to just adding one operator to pybind in a *_op.cc file. + # And for detail pybind information, please see generated paddle/pybind/pybind.h. 
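+  # The block below decides which operator name is written into the generated
+  # pybind header: if the op's .cc file contains more than one REGISTER_OP(...)
+  # call, the first registered name is used; otherwise the name is derived from
+  # the target by stripping the trailing "_op" suffix.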
+ file(READ ${TARGET}.cc TARGET_CONTENT) + string(REGEX MATCH "REGISTER_OP\\(.*REGISTER_OP\\(" multi_register "${TARGET_CONTENT}") + string(REGEX MATCH "REGISTER_OP\\([a-z0-9_]*," one_register "${multi_register}") + if (one_register STREQUAL "") + string(REPLACE "_op" "" TARGET "${TARGET}") + else () + string(REPLACE "REGISTER_OP(" "" TARGET "${one_register}") + string(REPLACE "," "" TARGET "${TARGET}") + endif() + + # pybind USE_NO_KERNEL_OP + # HACK: if REGISTER_OP_CPU_KERNEL presents the operator must have kernel + string(REGEX MATCH "REGISTER_OP_CPU_KERNEL" regex_result "${TARGET_CONTENT}") + string(REPLACE "_op" "" TARGET "${TARGET}") + if (${pybind_flag} EQUAL 0 AND regex_result STREQUAL "") + file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(${TARGET});\n") + set(pybind_flag 1) + endif() + + # pybind USE_CPU_ONLY_OP + list(LENGTH cu_srcs cu_srcs_len) + list(LENGTH cu_cc_srcs cu_cc_srcs_len) + if (${pybind_flag} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0) + file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n") + set(pybind_flag 1) + endif() + + # pybind USE_OP + if (${pybind_flag} EQUAL 0) + file(APPEND ${pybind_file} "USE_OP(${TARGET});\n") + endif() +endfunction() + +add_subdirectory(math) +add_subdirectory(nccl) + +if(WITH_GPU) + op_library(nccl_op DEPS nccl_common) + file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n") +else() + set(DEPS_OPS ${DEPS_OPS} nccl_op) +endif() + +if(WITH_DISTRIBUTE) + add_subdirectory(detail) + set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf) + set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + op_library(send_op DEPS ${DISTRIBUTE_DEPS}) + set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + op_library(recv_op DEPS ${DISTRIBUTE_DEPS}) + set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + op_library(listen_and_serv_op DEPS ${DISTRIBUTE_DEPS}) + set_source_files_properties(listen_and_serv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op listen_and_serv_op sum_op executor) +else() + set(DEPS_OPS ${DEPS_OPS} send_op recv_op listen_and_serv_op) +endif() + +op_library(cond_op DEPS framework_proto tensor net_op) +op_library(cross_entropy_op DEPS cross_entropy) +op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax) +op_library(softmax_op DEPS softmax) +op_library(detection_output_op DEPS softmax) +op_library(sequence_softmax_op DEPS softmax) +op_library(sum_op DEPS selected_rows_functor) +op_library(sgd_op DEPS selected_rows_functor) +op_library(print_op DEPS lod_tensor) +op_library(adagrad_op DEPS selected_rows_functor) +op_library(maxout_op DEPS maxouting) +op_library(unpool_op DEPS unpooling) +op_library(pool_with_index_op DEPS pooling) +op_library(lod_rank_table_op DEPS lod_rank_table) +op_library(lod_tensor_to_array_op DEPS lod_rank_table_op) +op_library(array_to_lod_tensor_op DEPS lod_rank_table_op) +op_library(max_sequence_len_op DEPS lod_rank_table) +op_library(sequence_conv_op DEPS context_project) +op_library(sequence_pool_op DEPS sequence_pooling) +op_library(lstm_op DEPS sequence2batch lstm_compute) +op_library(lstmp_op DEPS sequence2batch lstm_compute) +op_library(gru_op DEPS sequence2batch gru_compute) +op_library(recurrent_op DEPS executor) +op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale 
math_function) +op_library(cos_sim_op DEPS cos_sim_functor) +op_library(parallel_do_op DEPS executor) +op_library(create_reader_op DEPS reader) + +# Regist multiple Kernel to pybind +if (WITH_GPU) + +op_library(conv_op SRCS conv_op.cc conv_op.cu.cc conv_cudnn_op.cu.cc DEPS + vol2col depthwise_conv) + +op_library(edit_distance_op SRCS edit_distance_op.cc edit_distance_op.cu DEPS math_function) +op_library(pool_op SRCS pool_op.cc pool_op.cu.cc pool_cudnn_op.cu.cc DEPS pooling) +op_library(conv_transpose_op SRCS conv_transpose_op.cc conv_transpose_op.cu.cc + conv_transpose_cudnn_op.cu.cc DEPS vol2col) +file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(conv2d, CUDNN);\n") +file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(pool2d, CUDNN);\n") +file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(conv2d_transpose, CUDNN);\n") +else() +op_library(conv_op SRCS conv_op.cc DEPS vol2col) +op_library(pool_op SRCS pool_op.cc DEPS pooling) +op_library(conv_transpose_op SRCS conv_transpose_op.cc DEPS vol2col) +endif() + +# FIXME(typhoonzero): save/load depends lodtensor serialization functions +op_library(save_op DEPS lod_tensor) +op_library(load_op DEPS lod_tensor) +op_library(save_combine_op DEPS lod_tensor) +op_library(load_combine_op DEPS lod_tensor) + +list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) +foreach(src ${GENERAL_OPS}) + op_library(${src}) +endforeach() +file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\nUSE_NO_KERNEL_OP(create_random_data_generator);\n") + +set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") + +cc_test(gather_test SRCS gather_test.cc DEPS tensor) +cc_test(net_op_test SRCS net_op_test.cc DEPS net_op) +cc_test(scatter_test SRCS scatter_test.cc DEPS tensor) +cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor) +cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op) +cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory) +if(WITH_GPU) + cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) +endif() +cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) +cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) diff --git a/paddle/fluid/operators/accuracy_op.cc b/paddle/fluid/operators/accuracy_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..43689b3b7da5a0f5157ec8bc5fcf19d643ddc4ca --- /dev/null +++ b/paddle/fluid/operators/accuracy_op.cc @@ -0,0 +1,103 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/accuracy_op.h" + +namespace paddle { +namespace operators { + +class AccuracyOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Out"), + "Input (Out) of accuracy op should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Indices"), + "Input (Indices) of accuracy op should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), + "Input (Label) of accuracy op should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Accuracy"), + "Output (Accuracy) of AccuracyOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Correct"), + "Output (Correct) of AccuracyOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Total"), + "Output (Total) of AccuracyOp should not be null."); + + auto inference_dim = ctx->GetInputDim("Out"); + auto label_dim = ctx->GetInputDim("Label"); + // Assume indices has same shape as inference, because + // it's the output of topk. + + PADDLE_ENFORCE_EQ(label_dim.size(), 2, "label's rank must be 2."); + PADDLE_ENFORCE_EQ(label_dim[1], 1, "label's second dimension must be 1"); + PADDLE_ENFORCE_EQ(inference_dim[0], label_dim[0], + "the inference tensor's num_rows must be" + " the same as label."); + + ctx->SetOutputDim("Accuracy", {1}); + ctx->SetOutputDim("Correct", {1}); + ctx->SetOutputDim("Total", {1}); + ctx->ShareLoD("Out", /*->*/ "Accuracy"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Out")->type()), + ctx.GetPlace()); + } +}; + +class AccuracyOpMaker : public framework::OpProtoAndCheckerMaker { + public: + AccuracyOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + // TODO(typhoonzero): support both inference value and indices. + AddInput("Out", "The network output of topk (inferences)"); + AddInput("Indices", "The the network output of topk (indices)"); + AddInput("Label", "Label of the training data"); + // TODO(typhoonzero): AddInput("Weight", ... + AddOutput("Accuracy", "The accuracy of current batch"); + AddOutput("Correct", "The correct samples count of current batch"); + AddOutput("Total", "The samples count of current batch"); + + AddComment(R"DOC( +Accuracy Operator. + +It will print accuracy rate for classification. +The accuracy is calculated as follows: + +$$accuracy = \frac{NumOfCorrectPredicts}{NumOfAllSamples}$$ + +Both the input Out and Label can carry the LoD (Level of Details) +information, or not. But the output only shares the LoD information +with the input Out(Inference). + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(accuracy, ops::AccuracyOp, ops::AccuracyOpMaker, + paddle::framework::EmptyGradOpMaker); +// FIXME(typhoonzero): types of T is for infernece data. +// label data is always int. +REGISTER_OP_CPU_KERNEL(accuracy, + ops::AccuracyKernel, + ops::AccuracyKernel); diff --git a/paddle/fluid/operators/accuracy_op.cu b/paddle/fluid/operators/accuracy_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..4462b9ba5c0e902933c53130f72fe40f807bde4a --- /dev/null +++ b/paddle/fluid/operators/accuracy_op.cu @@ -0,0 +1,99 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "paddle/fluid/operators/accuracy_op.h" +#include "paddle/fluid/platform/cuda_helper.h" +#include "paddle/fluid/platform/gpu_info.h" + +namespace paddle { +namespace operators { +using platform::PADDLE_CUDA_NUM_THREADS; + +template +__global__ void AccuracyCudaKernel(const int N, const int D, + const int64_t* Xdata, + const int64_t* labeldata, int* correct_data, + float* accuracy, int* total_data) { + int count = 0; + __shared__ int total[BlockSize]; + + // support only 1 block + for (int i = threadIdx.x; i < (N); i += BlockSize) { + for (int j = 0; j < D; ++j) { + if (Xdata[i * D + j] == labeldata[i]) { + ++count; + break; + } + } + } + total[threadIdx.x] = count; + __syncthreads(); + + // reduce the count with init value 0, and output accuracy. + int result = thrust::reduce(thrust::device, total, total + BlockSize, 0); + if (threadIdx.x == 0) { + *correct_data = result; + *accuracy = static_cast(result) / static_cast(N); + *total_data = N; + } +} + +template +class AccuracyOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + auto* inference = ctx.Input("Out"); + auto* indices = ctx.Input("Indices"); + auto* label = ctx.Input("Label"); + + auto* accuracy = ctx.Output("Accuracy"); + auto* correct = ctx.Output("Correct"); + auto* total = ctx.Output("Total"); + // FIXME(typhoonzero): only support indices currently + // if add support for output values, how to detect the data type? + const int64_t* indices_data = indices->data(); + const int64_t* label_data = label->data(); + + int* correct_data = correct->mutable_data(ctx.GetPlace()); + int* total_data = total->mutable_data(ctx.GetPlace()); + float* accuracy_data = accuracy->mutable_data(ctx.GetPlace()); + + int num_samples = static_cast(inference->dims()[0]); + size_t infer_width = inference->dims()[1]; + auto stream = ctx.cuda_device_context().stream(); + platform::GpuMemsetAsync(accuracy_data, 0, sizeof(float), stream); + + if (num_samples == 0) { + return; + } + + AccuracyCudaKernel< + PADDLE_CUDA_NUM_THREADS><<<1, PADDLE_CUDA_NUM_THREADS, 0, stream>>>( + num_samples, infer_width, indices_data, label_data, correct_data, + accuracy_data, total_data); + } +}; + +} // namespace operators +} // namespace paddle + +// FIXME(typhoonzero): types of T is for inference data. +// label data is always int64 +REGISTER_OP_CUDA_KERNEL(accuracy, + paddle::operators::AccuracyOpCUDAKernel, + paddle::operators::AccuracyOpCUDAKernel); diff --git a/paddle/fluid/operators/accuracy_op.h b/paddle/fluid/operators/accuracy_op.h new file mode 100644 index 0000000000000000000000000000000000000000..b3ed1d3fe09ba044142ed69a463918d7e03a78e9 --- /dev/null +++ b/paddle/fluid/operators/accuracy_op.h @@ -0,0 +1,70 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class AccuracyKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* inference = ctx.Input("Out"); + auto* indices = ctx.Input("Indices"); + auto* label = ctx.Input("Label"); + auto* accuracy = ctx.Output("Accuracy"); + auto* correct = ctx.Output("Correct"); + auto* total = ctx.Output("Total"); + + int* correct_data = correct->mutable_data(ctx.GetPlace()); + int* total_data = total->mutable_data(ctx.GetPlace()); + float* accuracy_data = accuracy->mutable_data(ctx.GetPlace()); + + const int64_t* indices_data = indices->data(); + const int64_t* label_data = label->data(); + + size_t num_samples = inference->dims()[0]; + size_t class_dim = inference->dims()[1]; + *accuracy_data = 0.0f; + + if (num_samples == 0) { + return; + } + + int num_correct = 0; + // assume inference is already the topk of the output + for (size_t i = 0; i < num_samples; ++i) { + PADDLE_ENFORCE_GE(label_data[i], 0, "label must >= 0"); + for (size_t j = 0; j < class_dim; ++j) { + if (indices_data[i * class_dim + j] == label_data[i]) { + ++num_correct; + break; + } + } + } + + *correct_data = num_correct; + *total_data = num_samples; + *accuracy_data = + static_cast(num_correct) / static_cast(num_samples); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c04dd8cb9163cf7b05fd09bcf7f1d2937368614f --- /dev/null +++ b/paddle/fluid/operators/activation_op.cc @@ -0,0 +1,615 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
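The CPU kernel above boils the accuracy op down to a counting loop over the top-k indices. As a quick illustration, here is a minimal standalone sketch of that computation outside the Paddle framework; the function name and the flat-vector layout are assumptions made only for this example.

#include <cstddef>
#include <cstdint>
#include <vector>

// Sketch of the accuracy formula used by the op:
// accuracy = num_correct / num_samples, where a sample counts as correct
// as soon as its label appears among that sample's top-k indices.
float ComputeAccuracy(const std::vector<int64_t>& topk_indices,  // num_samples * k, row-major
                      const std::vector<int64_t>& labels,        // num_samples
                      std::size_t k) {
  std::size_t num_samples = labels.size();
  if (num_samples == 0) return 0.0f;
  int num_correct = 0;
  for (std::size_t i = 0; i < num_samples; ++i) {
    for (std::size_t j = 0; j < k; ++j) {
      if (topk_indices[i * k + j] == labels[i]) {
        ++num_correct;
        break;  // same early exit as AccuracyKernel::Compute
      }
    }
  }
  return static_cast<float>(num_correct) / static_cast<float>(num_samples);
}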
*/ + +#include "paddle/fluid/operators/activation_op.h" + +namespace paddle { +namespace operators { + +class ActivationOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class ActivationOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Out")); + } +}; + +class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Sigmoid operator"); + AddOutput("Out", "Output of Sigmoid operator"); + AddComment(R"DOC( +Sigmoid Activation Operator + +$$out = \frac{1}{1 + e^{-x}}$$ + +)DOC"); + } +}; + +class LogSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LogSigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of LogSigmoid operator"); + AddOutput("Out", "Output of LogSigmoid operator"); + AddComment(R"DOC( +Logsigmoid Activation Operator + +$$out = \log \frac{1}{1 + e^{-x}}$$ + +)DOC"); + } +}; + +class ExpOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ExpOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Exp operator"); + AddOutput("Out", "Output of Exp operator"); + AddComment(R"DOC( +Exp Activation Operator. + +$out = e^x$ + +)DOC"); + } +}; + +class ReluOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ReluOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Relu operator"); + AddOutput("Out", "Output of Relu operator"); + AddComment(R"DOC( +Relu Activation Operator. + +$out = \max(x, 0)$ + +)DOC"); + } +}; + +class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LeakyReluOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of LeakyRelu operator"); + AddOutput("Out", "Output of LeakyRelu operator"); + AddAttr("alpha", "The small negative slope").SetDefault(0.02f); + AddComment(R"DOC( +LeakyRelu Activation Operator. + +$out = \max(x, \alpha * x)$ + +)DOC"); + } +}; + +class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SoftShrinkOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Softshrink operator"); + AddOutput("Out", "Output of Softshrink operator"); + AddAttr("lambda", "non-negative offset").SetDefault(0.5f); + AddComment(R"DOC( +Softshrink Activation Operator. 
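Since the op makers above only document the formulas, a scalar reference sketch may help when reading the functors later in this diff. These are plain C++ stand-ins, not the Eigen expressions the kernels actually use; the LogSigmoid version already uses the max-based rewrite that the functor comments explain further down.

#include <algorithm>
#include <cmath>

// Scalar versions of the element-wise formulas documented above.
float Sigmoid(float x) { return 1.0f / (1.0f + std::exp(-x)); }
float LogSigmoid(float x) {
  // -log(1 + exp(-x)), written with max(-x, 0) pulled out for numerical stability.
  float t = std::max(-x, 0.0f);
  return -(t + std::log(std::exp(-t) + std::exp(-x - t)));
}
float Relu(float x) { return std::max(x, 0.0f); }
float LeakyRelu(float x, float alpha = 0.02f) { return std::max(x, alpha * x); }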
+ +$$ +out = \begin{cases} + x - \lambda, \text{if } x > \lambda \\ + x + \lambda, \text{if } x < -\lambda \\ + 0, \text{otherwise} + \end{cases} +$$ + +)DOC"); + } +}; + +class TanhOpMaker : public framework::OpProtoAndCheckerMaker { + public: + TanhOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Tanh operator"); + AddOutput("Out", "Output of Tanh operator"); + AddComment(R"DOC( +Tanh Activation Operator. + +$$out = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ + +)DOC"); + } +}; + +class TanhShrinkOpMaker : public framework::OpProtoAndCheckerMaker { + public: + TanhShrinkOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of TanhShrink operator"); + AddOutput("Out", "Output of TanhShrink operator"); + AddComment(R"DOC( +TanhShrink Activation Operator. + +$$out = x - \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ + +)DOC"); + } +}; + +class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker { + public: + HardShrinkOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of HardShrink operator"); + AddOutput("Out", "Output of HardShrink operator"); + AddAttr("threshold", "The value of threshold for HardShrink") + .SetDefault(0.5f); + AddComment(R"DOC( +HardShrink Activation Operator. + +$$ +out = \begin{cases} + x, \text{if } x > \lambda \\ + x, \text{if } x < -\lambda \\ + 0, \text{otherwise} + \end{cases} +$$ + +)DOC"); + } +}; + +class SqrtOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SqrtOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Sqrt operator"); + AddOutput("Out", "Output of Sqrt operator"); + AddComment(R"DOC( +Sqrt Activation Operator. + +$out = \sqrt{x}$ + +)DOC"); + } +}; + +class AbsOpMaker : public framework::OpProtoAndCheckerMaker { + public: + AbsOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Abs operator"); + AddOutput("Out", "Output of Abs operator"); + AddComment(R"DOC( +Abs Activation Operator. + +$out = |x|$ + +)DOC"); + } +}; + +class CeilOpMaker : public framework::OpProtoAndCheckerMaker { + public: + CeilOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Ceil operator"); + AddOutput("Out", "Output of Ceil operator"); + AddComment(R"DOC( +Ceil Activation Operator. + +$out = ceil(x)$ + +)DOC"); + } +}; + +class FloorOpMaker : public framework::OpProtoAndCheckerMaker { + public: + FloorOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Floor operator"); + AddOutput("Out", "Output of Floor operator"); + AddComment(R"DOC( +Floor Activation Operator. + +$out = floor(x)$ + +)DOC"); + } +}; + +class RoundOpMaker : public framework::OpProtoAndCheckerMaker { + public: + RoundOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Round operator"); + AddOutput("Out", "Output of Round operator"); + AddComment(R"DOC( +Round Activation Operator. 
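The softshrink and hardshrink cases above differ only in whether the band around zero is subtracted out or kept as-is. A small scalar sketch makes the distinction explicit; the default values are copied from the makers' SetDefault calls, and the function names are illustrative.

// softshrink shifts values toward zero by lambda; hardshrink keeps them unchanged
// outside the [-threshold, threshold] band and zeroes them inside it.
float SoftShrink(float x, float lambda = 0.5f) {
  if (x > lambda) return x - lambda;
  if (x < -lambda) return x + lambda;
  return 0.0f;
}
float HardShrink(float x, float threshold = 0.5f) {
  return (x > threshold || x < -threshold) ? x : 0.0f;
}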
+ +$out = [x]$ + +)DOC"); + } +}; + +class ReciprocalOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ReciprocalOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Reciprocal operator"); + AddOutput("Out", "Output of Reciprocal operator"); + AddComment(R"DOC( +Reciprocal Activation Operator. + +$$out = \frac{1}{x}$$ + +)DOC"); + } +}; + +class LogOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LogOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Log operator"); + AddOutput("Out", "Output of Log operator"); + AddComment(R"DOC( +Log Activation Operator. + +$out = \ln(x)$ + +Natural logarithm of x. + +)DOC"); + } +}; + +class SquareOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SquareOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Square operator"); + AddOutput("Out", "Output of Square operator"); + AddComment(R"DOC( +Square Activation Operator. + +$out = x^2$ + +)DOC"); + } +}; + +class SoftplusOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SoftplusOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Softplus operator"); + AddOutput("Out", "Output of Softplus operator"); + AddComment(R"DOC( +Softplus Activation Operator. + +$out = \ln(1 + e^{x})$ + +)DOC"); + } +}; + +class SoftsignOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SoftsignOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Softsign operator"); + AddOutput("Out", "Output of Softsign operator"); + AddComment(R"DOC( +Softsign Activation Operator. + +$$out = \frac{x}{1 + |x|}$$ + +)DOC"); + } +}; + +class BReluOpMaker : public framework::OpProtoAndCheckerMaker { + public: + BReluOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of BRelu operator"); + AddOutput("Out", "Output of BRelu operator"); + AddAttr("t_min", "The min marginal value of BRelu") + .SetDefault(static_cast(0)); + AddAttr("t_max", "The max marginal value of BRelu") + .SetDefault(static_cast(24)); + AddComment(R"DOC( +BRelu Activation Operator. + +$out = \max(\min(x, t_{min}), t_{max})$ + +)DOC"); + } +}; + +class SoftReluOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SoftReluOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of SoftRelu operator"); + AddOutput("Out", "Output of SoftRelu operator"); + AddAttr("threshold", "The threshold value of SoftRelu") + .SetDefault(40.0f); + AddComment(R"DOC( +SoftRelu Activation Operator. + +$out = \ln(1 + \exp(\max(\min(x, threshold), threshold))$ + +)DOC"); + } +}; + +class ELUOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ELUOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of ELU operator"); + AddOutput("Out", "Output of ELU operator"); + AddAttr("alpha", "The alpha value of ELU").SetDefault(1.0f); + AddComment(R"DOC( +ELU Activation Operator. 
+ +Applies the following element-wise computation on the input according to +https://arxiv.org/abs/1511.07289. + +$out = \max(0, x) + \min(0, \alpha * (e^x - 1))$ + +)DOC"); + } +}; + +class Relu6OpMaker : public framework::OpProtoAndCheckerMaker { + public: + Relu6OpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Relu6 operator"); + AddOutput("Out", "Output of Relu6 operator"); + AddAttr("threshold", "The threshold value of Relu6") + .SetDefault(6.0f); + AddComment(R"DOC( +Relu6 Activation Operator. + +$out = \min(\max(0, x), 6)$ + +)DOC"); + } +}; + +class PowOpMaker : public framework::OpProtoAndCheckerMaker { + public: + PowOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Pow operator"); + AddOutput("Out", "Output of Pow operator"); + AddAttr("factor", "The exponential factor of Pow").SetDefault(1.0f); + AddComment(R"DOC( +Pow Activation Operator. + +$out = x^{factor}$ + +)DOC"); + } +}; + +class STanhOpMaker : public framework::OpProtoAndCheckerMaker { + public: + STanhOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of STanh operator"); + AddOutput("Out", "Output of STanh operator"); + AddAttr("scale_a", "The scale parameter of a for the input") + .SetDefault(2.0f / 3.0f); + AddAttr("scale_b", "The scale parameter of b for the input") + .SetDefault(1.7159f); + AddComment(R"DOC( +STanh Activation Operator. + +$$out = b * \frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$ + +)DOC"); + } +}; + +class ThresholdedReluOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ThresholdedReluOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of ThresholdedRelu operator"); + AddOutput("Out", "Output of ThresholdedRelu operator"); + AddAttr("threshold", "The threshold location of activation") + .SetDefault(1.0f); + AddComment(R"DOC( +ThresholdedRelu Activation Operator. + +$$ +out = \begin{cases} + x, \text{if } x > threshold \\ + 0, \text{otherwise} + \end{cases} +$$ + +)DOC"); + } +}; + +class HardSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { + public: + HardSigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of HardSigmoid operator"); + AddOutput("Out", "Output of HardSigmoid operator"); + AddAttr("slope", "Slope for linear approximation of sigmoid") + .SetDefault(0.2f); + AddAttr("offset", "Offset for linear approximation of sigmoid") + .SetDefault(0.5f); + AddComment(R"DOC( +HardSigmoid Activation Operator. + +Segment-wise linear approximation of sigmoid(https://arxiv.org/abs/1603.00391), +which is much faster than sigmoid. + +$out = \max(0, \min(1, slope * x + shift))$ + +The slope should be positive. The offset can be either positive or negative. +The default slope and shift are set according to the above reference. +It is recommended to use the defaults for this activation. 
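For the ELU and hard-sigmoid formulas documented above, an element-wise sketch in plain C++ (defaults copied from the makers; function names are assumptions for illustration only):

#include <algorithm>
#include <cmath>

// ELU: identity for positive inputs, alpha * (e^x - 1) for negative inputs.
float Elu(float x, float alpha = 1.0f) {
  return std::max(0.0f, x) + std::min(0.0f, alpha * (std::exp(x) - 1.0f));
}
// Hard sigmoid: piecewise-linear approximation clipped to [0, 1].
float HardSigmoid(float x, float slope = 0.2f, float offset = 0.5f) {
  return std::max(0.0f, std::min(1.0f, slope * x + offset));
}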
+ +)DOC"); + } +}; + +class SwishOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SwishOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Swish operator"); + AddOutput("Out", "Output of Swish operator"); + AddAttr("beta", "Constant beta of swish operator").SetDefault(1.0f); + AddComment(R"DOC( +Swish Activation Operator. + +$$out = \frac{x}{1 + e^{- \beta x}}$$ + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP(sigmoid, ops::ActivationOp, ops::SigmoidOpMaker, sigmoid_grad, + ops::ActivationOpGrad); + +REGISTER_OP(logsigmoid, ops::ActivationOp, ops::LogSigmoidOpMaker, + logsigmoid_grad, ops::ActivationOpGrad); + +REGISTER_OP(exp, ops::ActivationOp, ops::ExpOpMaker, exp_grad, + ops::ActivationOpGrad); + +REGISTER_OP(relu, ops::ActivationOp, ops::ReluOpMaker, relu_grad, + ops::ActivationOpGrad); + +REGISTER_OP(tanh, ops::ActivationOp, ops::TanhOpMaker, tanh_grad, + ops::ActivationOpGrad); + +REGISTER_OP(tanh_shrink, ops::ActivationOp, ops::TanhShrinkOpMaker, + tanh_shrink_grad, ops::ActivationOpGrad); + +REGISTER_OP(softshrink, ops::ActivationOp, ops::SoftShrinkOpMaker, + softshrink_grad, ops::ActivationOpGrad); + +REGISTER_OP(sqrt, ops::ActivationOp, ops::SqrtOpMaker, sqrt_grad, + ops::ActivationOpGrad); + +REGISTER_OP(abs, ops::ActivationOp, ops::AbsOpMaker, abs_grad, + ops::ActivationOpGrad); + +REGISTER_OP(ceil, ops::ActivationOp, ops::CeilOpMaker, ceil_grad, + ops::ActivationOpGrad); + +REGISTER_OP(floor, ops::ActivationOp, ops::FloorOpMaker, floor_grad, + ops::ActivationOpGrad); + +REGISTER_OP(round, ops::ActivationOp, ops::RoundOpMaker, round_grad, + ops::ActivationOpGrad); + +REGISTER_OP(reciprocal, ops::ActivationOp, ops::ReciprocalOpMaker, + reciprocal_grad, ops::ActivationOpGrad); + +REGISTER_OP(log, ops::ActivationOp, ops::LogOpMaker, log_grad, + ops::ActivationOpGrad); + +REGISTER_OP(square, ops::ActivationOp, ops::SquareOpMaker, square_grad, + ops::ActivationOpGrad); + +REGISTER_OP(softplus, ops::ActivationOp, ops::SoftplusOpMaker, softplus_grad, + ops::ActivationOpGrad); + +REGISTER_OP(softsign, ops::ActivationOp, ops::SoftsignOpMaker, softsign_grad, + ops::ActivationOpGrad); + +REGISTER_OP(brelu, ops::ActivationOp, ops::BReluOpMaker, brelu_grad, + ops::ActivationOpGrad); + +REGISTER_OP(leaky_relu, ops::ActivationOp, ops::LeakyReluOpMaker, + leaky_relu_grad, ops::ActivationOpGrad); + +REGISTER_OP(soft_relu, ops::ActivationOp, ops::SoftReluOpMaker, soft_relu_grad, + ops::ActivationOpGrad); + +REGISTER_OP(elu, ops::ActivationOp, ops::ELUOpMaker, elu_grad, + ops::ActivationOpGrad); + +REGISTER_OP(relu6, ops::ActivationOp, ops::Relu6OpMaker, relu6_grad, + ops::ActivationOpGrad); + +REGISTER_OP(pow, ops::ActivationOp, ops::PowOpMaker, pow_grad, + ops::ActivationOpGrad); + +REGISTER_OP(stanh, ops::ActivationOp, ops::STanhOpMaker, stanh_grad, + ops::ActivationOpGrad); + +REGISTER_OP(hard_shrink, ops::ActivationOp, ops::HardShrinkOpMaker, + hard_shrink_grad, ops::ActivationOpGrad); + +REGISTER_OP(thresholded_relu, ops::ActivationOp, ops::ThresholdedReluOpMaker, + thresholded_relu_grad, ops::ActivationOpGrad); + +REGISTER_OP(hard_sigmoid, ops::ActivationOp, ops::HardSigmoidOpMaker, + hard_sigmoid_grad, ops::ActivationOpGrad); + +REGISTER_OP(swish, ops::ActivationOp, ops::SwishOpMaker, swish_grad, + ops::ActivationOpGrad); + +#define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor) \ + 
REGISTER_OP_CPU_KERNEL( \ + act_type, ops::ActivationKernel>, \ + ops::ActivationKernel>); \ + REGISTER_OP_CPU_KERNEL( \ + act_type##_grad, \ + ops::ActivationGradKernel>, \ + ops::ActivationGradKernel>); + +FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CPU_KERNEL); diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..b86a7926a978b987d9dbb51fba55e025aab5e7fd --- /dev/null +++ b/paddle/fluid/operators/activation_op.cu @@ -0,0 +1,33 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/activation_op.h" + +namespace ops = paddle::operators; + +#define REGISTER_ACTIVATION_CUDA_KERNEL(act_type, functor, grad_functor) \ + REGISTER_OP_CUDA_KERNEL( \ + act_type, ops::ActivationKernel>, \ + ops::ActivationKernel>); \ + REGISTER_OP_CUDA_KERNEL( \ + act_type##_grad, \ + ops::ActivationGradKernel>, \ + ops::ActivationGradKernel>); + +FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL); diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h new file mode 100644 index 0000000000000000000000000000000000000000..7a6ae2224c84cc17223f43046f06f08d11451439 --- /dev/null +++ b/paddle/fluid/operators/activation_op.h @@ -0,0 +1,799 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
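The CPU and CUDA registrations share one list through the X-macro FOR_EACH_KERNEL_FUNCTOR: each translation unit defines its own REGISTER_ACTIVATION_*_KERNEL macro and passes it in, and the list expands it once per (op, functor, grad functor) triple. A framework-free sketch of the same pattern, with invented names, is below.

#include <cstdio>

// Central list, expanded differently by each call site (X-macro pattern).
#define FOR_EACH_ACTIVATION(__macro) \
  __macro(relu)                      \
  __macro(tanh)                      \
  __macro(sigmoid)

// One possible "expansion": just print the name of each entry.
#define PRINT_ACTIVATION(name) std::printf("registered kernel: %s\n", #name);

int main() {
  FOR_EACH_ACTIVATION(PRINT_ACTIVATION);  // expands to three printf statements
  return 0;
}

The benefit is that adding a new activation means touching the list once instead of editing every registration site by hand.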
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detail/safe_ref.h" + +namespace paddle { +namespace operators { + +template +class ActivationKernel + : public framework::OpKernel { + public: + using T = typename Functor::ELEMENT_TYPE; + + void Compute(const framework::ExecutionContext& context) const override { + auto& X = detail::Ref(context.Input("X"), + "Cannot get input tensor X, variable name = %s", + context.op().Input("X")); + + auto& Out = detail::Ref(context.Output("Out"), + "Cannot get output tensor Out, variable name = %s", + context.op().Output("Out")); + Out.mutable_data(context.GetPlace()); + auto x = framework::EigenVector::Flatten(X); + auto out = framework::EigenVector::Flatten(Out); + auto* place = + context.template device_context().eigen_device(); + Functor functor; + + auto attrs = functor.GetAttrs(); + for (auto& attr : attrs) { + *attr.second = context.Attr(attr.first); + } + functor(*place, x, out); + } +}; + +template +class ActivationGradKernel + : public framework::OpKernel { + public: + using T = typename Functor::ELEMENT_TYPE; + void Compute(const framework::ExecutionContext& context) const override { + auto* X = context.Input("X"); + auto* Out = context.Input("Out"); + auto* dOut = + context.Input(framework::GradVarName("Out")); + auto* dX = context.Output(framework::GradVarName("X")); + dX->mutable_data(context.GetPlace()); + + auto dout = framework::EigenVector::Flatten(*dOut); + auto x = framework::EigenVector::Flatten(*X); + auto out = framework::EigenVector::Flatten(*Out); + auto dx = framework::EigenVector::Flatten(*dX); + auto* place = + context.template device_context().eigen_device(); + Functor functor; + auto attrs = functor.GetAttrs(); + for (auto& attr : attrs) { + *attr.second = context.Attr(attr.first); + } + functor(*place, x, out, dout, dx); + } +}; + +template +struct BaseActivationFunctor { + using ELEMENT_TYPE = T; + + using AttrPair = std::vector>; + + AttrPair GetAttrs() { return AttrPair(); } +}; + +// sigmoid(x) = 1 / (1 + exp(-x)) +template +struct SigmoidFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = static_cast(1) / (static_cast(1) + (-x).exp()); + } +}; + +template +struct SigmoidGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * out * (static_cast(1) - out); + } +}; + +// Originally: logsigmoid(x) = -log (1 + exp(-x)) +// For numerical stability, we can use the log-sum-exp trick: +// https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/ +// We can rewrite the above equation as: +// out = -log( exp(0) + exp(-x)) [since exp(0) = 1] +// = -log( exp(max(-x, 0) - max(-x, 0)) + exp(-x + max(-x, 0) - max(-x, 0))) +// = -log( exp(max(-x, 0)) * exp(-max(-x, 0)) - exp(max(-x, 0)) * exp(-x - +// max(-x, 0))) +// = -log( exp(max(-x, 0)) * (exp(-max(-x, 0)) + exp(-x - max(-x, 0)))) +// = -log( exp(max(-x, 0)) - log(exp(-max(-x, 0)) + exp(-x - max(-x, 0))) +// +// Hence, logsigmoid(x) = - (max(-x, 0) + log(exp(-max(-x, 0)) +// + exp(-x - max(-x, 0)))) +template +struct LogSigmoidFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + auto temp = (-x).cwiseMax(static_cast(0)); // temp = max(-x, 0) + out.device(d) = -temp - (((-temp).exp() + (-x - temp).exp()).log()); + } +}; + +// Originally: f' = exp(-x) / (1 + exp(-x)) 
+// For numerical stability: f' = exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) + +// exp(-x - max(-x, 0))) +template +struct LogSigmoidGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto temp = (-x).cwiseMax(static_cast(0)); // temp = max(-x, 0) + dx.device(d) = + dout * ((-x - temp).exp() / ((-temp).exp() + (-x - temp).exp())); + } +}; + +// exp(x) = e^x +template +struct ExpFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.exp(); + } +}; + +template +struct ExpGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * out; + } +}; + +// relu(x) = max(x, 0) +template +struct ReluFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.cwiseMax(static_cast(0)); + } +}; + +template +struct ReluGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * (x > static_cast(0)).template cast(); + } +}; + +// tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) +template +struct TanhFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.tanh(); + } +}; + +template +struct TanhGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * (static_cast(1) - out * out); + } +}; + +// tanhshrink(x) = x - tanh(x) +// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) +template +struct TanhShrinkFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x - x.tanh(); + } +}; + +template +struct TanhShrinkGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * (x.tanh() * x.tanh()); + } +}; + +// tanhshrink(x) = x - tanh(x) +// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) +template +struct HardShrinkFunctor : public BaseActivationFunctor { + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + template + void operator()(Device d, X x, Out out) const { + auto temp1 = (x < static_cast(threshold * -1)).template cast().eval(); + auto temp2 = (x > static_cast(threshold)).template cast().eval(); + out.device(d) = x * (temp1 + temp2); + } +}; + +template +struct HardShrinkGradFunctor : public BaseActivationFunctor { + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto temp1 = (x < static_cast(threshold * -1)).template cast().eval(); + auto temp2 = (x > static_cast(threshold)).template cast().eval(); + dx.device(d) = dout * (temp1 + temp2).template cast(); + } +}; + +// softshrink(x) = x - lambda, if x > lambda; x + lambda, if x < -lambda; 0 +// otherwise +template +struct SoftShrinkFunctor : public BaseActivationFunctor { + float lambda; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"lambda", &lambda}}; + } + + template + void operator()(Device d, X x, Out out) const { + auto lambdaT = static_cast(lambda); + auto temp1 = (x > lambdaT).template cast().eval(); + auto temp2 = (x < 
-lambdaT).template cast().eval(); + out.device(d) = temp1 * (x - lambdaT) + temp2 * (x + lambdaT); + } +}; + +template +struct SoftShrinkGradFunctor : public BaseActivationFunctor { + float lambda; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"lambda", &lambda}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto lambdaT = static_cast(lambda); + auto temp1 = (x > lambdaT).template cast().eval(); + auto temp2 = (x < -lambdaT).template cast().eval(); + dx.device(d) = dout * (temp1 + temp2).template cast(); + } +}; + +// sqrt(x) = x^(1/2) +template +struct SqrtFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.sqrt(); + } +}; + +template +struct SqrtGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + const Out out_conj = Eigen::numext::conj(out); + dx.device(d) = static_cast(0.5) * dout / out_conj; + } +}; + +// ceil(x) = ceiling(x) +template +struct CeilFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.ceil(); + } +}; + +template +struct ZeroGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = static_cast(0) / x; + } +}; + +// floor(x) = flooring(x) +template +struct FloorFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.floor(); + } +}; + +// round(x) = [x] +template +struct RoundFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.round(); + } +}; + +// abs(x) = |x| +template +struct AbsFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.abs(); + } +}; + +template +struct AbsGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * x.sign(); + } +}; + +// reciprocal(x) = 1 / x +template +struct ReciprocalFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = static_cast(1) / x; + } +}; + +template +struct ReciprocalGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * static_cast(-1) * out * out; + } +}; + +// log(x) = natural logarithm of x +template +struct LogFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.log(); + } +}; + +template +struct LogGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * (static_cast(1) / x); + } +}; + +// square(x) = x^2 +template +struct SquareFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.square(); + } +}; + +template +struct SquareGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * static_cast(2) * x; + } +}; + +template +struct BReluFunctor : public BaseActivationFunctor { + float t_min; + float t_max; + + // NOTE: Explicit hides the `BaseActivationFunctor::GetAttrs` + // not polymorphism for speed. 
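The GetAttrs() idiom noted above is what lets a single ActivationKernel serve every functor: each functor publishes name-to-member-pointer pairs, and the kernel copies the op's attributes into those members before invoking it. Below is a minimal sketch of that mechanism, using a hypothetical ClippedReluFunctor in place of BReluFunctor and a plain std::map standing in for the ExecutionContext.

#include <algorithm>
#include <map>
#include <string>
#include <utility>
#include <vector>

struct ClippedReluFunctor {  // stands in for BReluFunctor
  float t_min;
  float t_max;
  std::vector<std::pair<std::string, float*>> GetAttrs() {
    return {{"t_min", &t_min}, {"t_max", &t_max}};
  }
  float operator()(float x) const {
    return std::min(std::max(x, t_min), t_max);  // clip, like cwiseMax/cwiseMin above
  }
};

template <typename Functor>
float RunWithAttrs(const std::map<std::string, float>& op_attrs, float x) {
  Functor functor;
  for (auto& attr : functor.GetAttrs()) {
    *attr.second = op_attrs.at(attr.first);  // fill functor members by attribute name
  }
  return functor(x);
}

For example, RunWithAttrs<ClippedReluFunctor>({{"t_min", 0.0f}, {"t_max", 24.0f}}, 30.0f) clips the input to 24, matching the BRelu functor's min(max(x, t_min), t_max) behaviour.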
+ typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"t_min", &t_min}, {"t_max", &t_max}}; + } + + template + void operator()(Device d, X x, Out out) const { + out.device(d) = + x.cwiseMax(static_cast(t_min)).cwiseMin(static_cast(t_max)); + } +}; + +template +struct BReluGradFunctor : public BaseActivationFunctor { + float t_min; + float t_max; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"t_min", &t_min}, {"t_max", &t_max}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * + ((x > static_cast(t_min)) * (x < static_cast(t_max))) + .template cast(); + } +}; + +// relu6(x) = min(max(0, x), 6) +template +struct Relu6Functor : public BaseActivationFunctor { + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + template + void operator()(Device d, X x, Out out) const { + out.device(d) = + x.cwiseMax(static_cast(0)).cwiseMin(static_cast(threshold)); + } +}; + +template +struct Relu6GradFunctor : public BaseActivationFunctor { + float threshold; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * + ((x > static_cast(0)) * (x < static_cast(threshold))) + .template cast(); + } +}; + +// softplus(x) = log(1 + exp(x)) +// When x is a very large positive number, exp(x) may explode to inf, +// Using trick below for numerical stability +// https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/ +// Then: softplus(x) = max(x, 0) + log(exp(-max(x, 0)) + exp(x - max(x, 0))) +template +struct SoftplusFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) { + auto temp = x.cwiseMax(static_cast(0)); // temp = max(x, 0) + out.device(d) = temp + (((-temp).exp() + (x - temp).exp()).log()); + } +}; + +// d(softplus(x))/dx = exp(x) / (1 + exp(x)) +// For numerical stability: +// d(softplus(x))/dx = exp(x - max(x, 0)) / (exp(-max(x, 0)) + +// exp(x - max(x, 0))) +template +struct SoftplusGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) { + auto temp = x.cwiseMax(static_cast(0)); // temp = max(x, 0) + dx.device(d) = + dout * ((x - temp).exp() / ((-temp).exp() + (x - temp).exp())); + } +}; + +// softsign(x) = x / (1 + |x|) +template +struct SoftsignFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) { + out.device(d) = x / (static_cast(1) + x.abs()); + } +}; + +// d(softsign(x))/dx = 1 / (1 + |x|)^2 +// Taken from https://en.wikipedia.org/wiki/Activation_function +template +struct SoftsignGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) { + dx.device(d) = + dout * (static_cast(1) / (static_cast(1) + x.abs()).square()); + } +}; + +template +struct SoftReluFunctor : public BaseActivationFunctor { + float threshold; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + template + void operator()(Device d, X x, Out out) const { + auto tmp = static_cast(threshold); + auto temp = x.cwiseMax(-tmp).cwiseMin(tmp); + out.device(d) = (static_cast(1) + temp.exp()).log(); + } +}; + +template +struct SoftReluGradFunctor : public BaseActivationFunctor { + float threshold; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", 
&threshold}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto tmp = static_cast(threshold); + auto temp = ((x > -tmp) * (x < tmp)).template cast().eval(); + dx.device(d) = dout * (static_cast(1) - (-out).exp()) * temp; + } +}; + +template +struct LeakyReluFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.cwiseMax(static_cast(alpha) * x); + } +}; + +template +struct LeakyReluGradFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto temp1 = static_cast(alpha) * + (x < static_cast(0)).template cast().eval(); + auto temp2 = (x >= static_cast(0)).template cast().eval(); + dx.device(d) = dout * (temp1 + temp2).template cast(); + } +}; + +template +struct ELUFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.cwiseMax(static_cast(0)) + + (static_cast(alpha) * (x.exp() - static_cast(1))) + .cwiseMin(static_cast(0)); + } +}; + +template +struct ELUGradFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * (x > static_cast(0)).template cast() + + dout * (out + static_cast(alpha)) * + (x < static_cast(0)).template cast(); + } +}; + +// FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5198 +template +struct PowFunctor : public BaseActivationFunctor { + float factor; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"factor", &factor}}; + } + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.pow(static_cast(factor)); + } +}; + +template +struct PowGradFunctor : public BaseActivationFunctor { + float factor; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"factor", &factor}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * static_cast(factor) * + x.pow(static_cast(factor - static_cast(1))); + } +}; + +template +struct STanhFunctor : public BaseActivationFunctor { + float scale_a; + float scale_b; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"scale_a", &scale_a}, {"scale_b", &scale_b}}; + } + + template + void operator()(Device d, X x, Out out) const { + out.device(d) = + static_cast(scale_b) * (static_cast(scale_a) * x).tanh(); + } +}; + +template +struct STanhGradFunctor : public BaseActivationFunctor { + float scale_a; + float scale_b; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"scale_a", &scale_a}, {"scale_b", &scale_b}}; + } + + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto a = static_cast(scale_a); + auto b = static_cast(scale_b); + auto temp = (a * x).tanh() * (a * x).tanh(); + dx.device(d) = dout * a * b * (static_cast(1) - temp); + } +}; + +template +struct ThresholdedReluFunctor : public BaseActivationFunctor { + float threshold; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } 
+ + template + void operator()(Device d, X x, Out out) const { + auto th = static_cast(threshold); + out.device(d) = (x > th).template cast() * x; + } +}; + +template +struct ThresholdedReluGradFunctor : public BaseActivationFunctor { + float threshold; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto th = static_cast(threshold); + dx.device(d) = dout * (x > th).template cast(); + } +}; + +template +struct HardSigmoidFunctor : public BaseActivationFunctor { + float slope; + float offset; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"slope", &slope}, {"offset", &offset}}; + } + + template + void operator()(Device d, X x, Out out) const { + auto temp = x * static_cast(slope) + static_cast(offset); + out.device(d) = + temp.cwiseMax(static_cast(0)).cwiseMin(static_cast(1)); + } +}; + +template +struct HardSigmoidGradFunctor : public BaseActivationFunctor { + float slope; + float offset; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"slope", &slope}, {"offset", &offset}}; + } + + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * + ((out > static_cast(0)) * (out < static_cast(1))) + .template cast() * + static_cast(slope); + } +}; + +template +struct SwishFunctor : public BaseActivationFunctor { + float beta; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"beta", &beta}}; + } + + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x / (static_cast(1) + (static_cast(-beta) * x).exp()); + } +}; + +template +struct SwishGradFunctor : public BaseActivationFunctor { + float beta; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"beta", &beta}}; + } + + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto temp1 = static_cast(1) / + (static_cast(1) + (static_cast(-beta) * x).exp()); + auto temp2 = temp1 * (static_cast(1) - (beta * out)); + dx.device(d) = dout * ((beta * out) + temp2); + } +}; + +} // namespace operators +} // namespace paddle + +#define FOR_EACH_KERNEL_FUNCTOR(__macro) \ + __macro(sigmoid, SigmoidFunctor, SigmoidGradFunctor); \ + __macro(logsigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \ + __macro(exp, ExpFunctor, ExpGradFunctor); \ + __macro(relu, ReluFunctor, ReluGradFunctor); \ + __macro(tanh, TanhFunctor, TanhGradFunctor); \ + __macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \ + __macro(sqrt, SqrtFunctor, SqrtGradFunctor); \ + __macro(abs, AbsFunctor, AbsGradFunctor); \ + __macro(ceil, CeilFunctor, ZeroGradFunctor); \ + __macro(floor, FloorFunctor, ZeroGradFunctor); \ + __macro(round, RoundFunctor, ZeroGradFunctor); \ + __macro(reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \ + __macro(log, LogFunctor, LogGradFunctor); \ + __macro(square, SquareFunctor, SquareGradFunctor); \ + __macro(brelu, BReluFunctor, BReluGradFunctor); \ + __macro(soft_relu, SoftReluFunctor, SoftReluGradFunctor); \ + __macro(pow, PowFunctor, PowGradFunctor); \ + __macro(stanh, STanhFunctor, STanhGradFunctor); \ + __macro(softplus, SoftplusFunctor, SoftplusGradFunctor); \ + __macro(softsign, SoftsignFunctor, SoftsignGradFunctor); \ + __macro(relu6, Relu6Functor, Relu6GradFunctor); \ + __macro(leaky_relu, LeakyReluFunctor, LeakyReluGradFunctor); \ + __macro(tanh_shrink, TanhShrinkFunctor, TanhShrinkGradFunctor); \ + __macro(elu, ELUFunctor, ELUGradFunctor); \ + 
__macro(hard_shrink, HardShrinkFunctor, HardShrinkGradFunctor); \ + __macro(hard_sigmoid, HardSigmoidFunctor, HardSigmoidGradFunctor); \ + __macro(swish, SwishFunctor, SwishGradFunctor); \ + __macro(thresholded_relu, ThresholdedReluFunctor, ThresholdedReluGradFunctor); diff --git a/paddle/fluid/operators/adadelta_op.cc b/paddle/fluid/operators/adadelta_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..ececd47e6a6787a161405fec75dafda336fddfbf --- /dev/null +++ b/paddle/fluid/operators/adadelta_op.cc @@ -0,0 +1,112 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/adadelta_op.h" + +namespace paddle { +namespace operators { + +class AdadeltaOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Param"), + "Input(Param) of AdadeltaOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(Grad) of AdadeltaOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("AvgSquaredGrad"), + "Input(AvgSquaredGrad) of AdadeltaOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("AvgSquaredUpdate"), + "Input(AvgSquaredUpdate) of AdadeltaOp should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + "Output(ParamOut) of AdadeltaOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("AvgSquaredGradOut"), + "Output(AvgSquaredGradOut) of AdadeltaOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("AvgSquaredUpdateOut"), + "Output(AvgSquaredUpdateOut) of AdadeltaOp should not be null."); + + auto param_dim = ctx->GetInputDim("Param"); + PADDLE_ENFORCE_EQ( + param_dim, ctx->GetInputDim("Grad"), + "param and grad input of AdadeltaOp should have same dimension"); + PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("AvgSquaredGrad"), + "Param and AvgSquaredGrad input of AdadeltaOp " + "should have same dimension"); + PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("AvgSquaredUpdate"), + "Param and AvgSquaredUpdate input of AdadeltaOp " + "should have same dimension"); + + ctx->SetOutputDim("ParamOut", param_dim); + ctx->SetOutputDim("AvgSquaredGradOut", param_dim); + ctx->SetOutputDim("AvgSquaredUpdateOut", param_dim); + } +}; + +class AdadeltaOpMaker : public framework::OpProtoAndCheckerMaker { + public: + AdadeltaOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Param", "(Tensor) Input parameter"); + AddInput("Grad", "(Tensor) Input gradient"); + AddInput("AvgSquaredGrad", "(Tensor) Input average of squared gradient"); + AddInput("AvgSquaredUpdate", + "(Tensor) Input average of squared parameter updates"); + + AddOutput("ParamOut", "(Tensor) Output parameter"); + AddOutput("AvgSquaredGradOut", + "(Tensor) Output average of squared gradient"); + AddOutput("AvgSquaredUpdateOut", + "(Tensor) Output average of squared parameter updates"); + + 
AddAttr("rho", + "(float, default 0.95) Exponential decay rate " + "for squared gradients.") + .SetDefault(0.95f); + AddAttr("epsilon", + "(float, default 1.0e-6) Constant for " + "numerical stability") + .SetDefault(1.0e-6f); + AddComment(R"DOC( +Adadelta Optimizer. + +Adadelta optimizer is implemented as explained in: +https://arxiv.org/abs/1212.5701 +Adadelta is a per-dimension adaptive learning rate method used +for gradient descent. + +Adadelta updates are as follows: + +$$ +avg\_squared\_grad\_out = \rho * avg\_squared\_grad + (1 - \rho) * grad * grad \\ +param\_update = - \sqrt{\frac{avg\_squared\_update + \epsilon}{avg\_squared\_grad\_out + \epsilon}} * grad \\ +avg\_squared\_update\_out = \rho * avg\_squared\_update + (1 - \rho) * {param\_update}^2 \\ +param\_out = param + param\_update +$$ + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(adadelta, ops::AdadeltaOp, ops::AdadeltaOpMaker); +REGISTER_OP_CPU_KERNEL( + adadelta, ops::AdadeltaOpKernel, + ops::AdadeltaOpKernel); diff --git a/paddle/fluid/operators/adadelta_op.cu b/paddle/fluid/operators/adadelta_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..733482f788df8dfe1224ebe0d4494111bf9f647b --- /dev/null +++ b/paddle/fluid/operators/adadelta_op.cu @@ -0,0 +1,21 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/adadelta_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + adadelta, ops::AdadeltaOpKernel, + ops::AdadeltaOpKernel); diff --git a/paddle/fluid/operators/adadelta_op.h b/paddle/fluid/operators/adadelta_op.h new file mode 100644 index 0000000000000000000000000000000000000000..82ced08710448293fb91a4fb0dea7ab216cd3da6 --- /dev/null +++ b/paddle/fluid/operators/adadelta_op.h @@ -0,0 +1,69 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class AdadeltaOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto param_out_tensor = ctx.Output("ParamOut"); + auto avg_squared_grad_out_tensor = + ctx.Output("AvgSquaredGradOut"); + auto avg_squared_update_out_tensor = + ctx.Output("AvgSquaredUpdateOut"); + + param_out_tensor->mutable_data(ctx.GetPlace()); + avg_squared_grad_out_tensor->mutable_data(ctx.GetPlace()); + avg_squared_update_out_tensor->mutable_data(ctx.GetPlace()); + + T rho = static_cast(ctx.Attr("rho")); + T epsilon = static_cast(ctx.Attr("epsilon")); + + auto param = framework::EigenVector::Flatten( + *ctx.Input("Param")); + auto grad = framework::EigenVector::Flatten( + *ctx.Input("Grad")); + // Squared gradient accumulator + auto avg_squared_grad = framework::EigenVector::Flatten( + *ctx.Input("AvgSquaredGrad")); + // Squared updates accumulator + auto avg_squared_update = framework::EigenVector::Flatten( + *ctx.Input("AvgSquaredUpdate")); + auto param_out = framework::EigenVector::Flatten(*param_out_tensor); + auto avg_squared_grad_out = + framework::EigenVector::Flatten(*avg_squared_grad_out_tensor); + auto avg_squared_update_out = + framework::EigenVector::Flatten(*avg_squared_update_out_tensor); + auto& place = *ctx.template device_context().eigen_device(); + + avg_squared_grad_out.device(place) = + rho * avg_squared_grad + (1 - rho) * grad.square(); + auto update = + -((avg_squared_update + epsilon) / (avg_squared_grad_out + epsilon)) + .sqrt() * + grad; + avg_squared_update_out.device(place) = + rho * avg_squared_update + (1 - rho) * update.square(); + param_out.device(place) = param + update; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/adagrad_op.cc b/paddle/fluid/operators/adagrad_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..61c0ecd019b1d7811ee5cfd4b43358bdb0fba3d9 --- /dev/null +++ b/paddle/fluid/operators/adagrad_op.cc @@ -0,0 +1,145 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/adagrad_op.h" + +#include + +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" + +namespace paddle { +namespace operators { + +class AdagradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Param"), + "Input(Param) of AdagradOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(Grad) of AdagradOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Moment"), + "Input(Moment) of AdagradOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("LearningRate"), + "Input(LearningRate) of AdagradOp should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + "Output(ParamOut) of AdagradOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("MomentOut"), + "Output(MomentOut) of AdagradOp should not be null."); + + auto lr_dims = ctx->GetInputDim("LearningRate"); + PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, + "LearningRate should have one element"); + auto param_dims = ctx->GetInputDim("Param"); + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("Grad"), + "Param and Grad input of AdagradOp should have the same dimension."); + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("Moment"), + "Param and Moment input of AdagradOp should have the same dimension."); + + ctx->SetOutputDim("ParamOut", param_dims); + ctx->SetOutputDim("MomentOut", param_dims); + } +}; + +class AdagradOpMaker : public framework::OpProtoAndCheckerMaker { + public: + AdagradOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Param", "(Tensor) Input parameter"); + AddInput("Grad", "(Tensor) Input gradient"); + AddInput("Moment", "(Tensor) Second moment"); + AddInput("LearningRate", "(Tensor) Learning rate"); + + AddOutput("ParamOut", "(Tensor) Output parameter"); + AddOutput("MomentOut", "(Tensor) Output second moment"); + + AddAttr("epsilon", + "(float, default 1.0e-6) " + "Constant for numerical stability") + .SetDefault(1.0e-6f); + AddComment(R"DOC( + +Adaptive Gradient Algorithm (Adagrad). + +The update is done as follows: + +$$moment\_out = moment + grad * grad \\ +param\_out = param - \frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon} +$$ + +The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) +does not have the epsilon attribute. It is added here in our implementation +as also proposed here: http://cs231n.github.io/neural-networks-3/#ada +for numerical stability to avoid the division by zero error. + +)DOC"); + } +}; + +namespace { +size_t FindPos(const std::vector& rows, int64_t value) { + return std::find(rows.begin(), rows.end(), value) - rows.begin(); +} +} // namespace + +template +struct SparseAdagradFunctor { + void operator()(const platform::CPUDeviceContext& context, + const framework::SelectedRows& grad, + const framework::Tensor& learning_rate, T epsilon, + framework::Tensor* moment, framework::Tensor* param) { + // 1. g_m.rows = set(g.rows) + auto grad_width = grad.value().dims()[1]; + math::scatter::MergeAdd merge_func; + auto grad_merge = merge_func(context, grad); + auto& merge_rows = grad_merge.rows(); + auto* grad_merge_data = grad_merge.mutable_value()->template data(); + + // 2. 
m += g_m * g_m + math::scatter::Mul sqare_func; + auto grad_square = sqare_func(context, grad_merge, grad_merge); + + math::SelectedRowsAddToTensor functor; + functor(context, grad_square, moment); + + // 3. update parameter + auto* lr = learning_rate.data(); + auto* param_data = param->data(); + auto* moment_data = moment->data(); + + for (size_t i = 0; i < merge_rows.size(); i++) { + for (int64_t j = 0; j < grad_width; j++) { + param_data[merge_rows[i] * grad_width + j] -= + lr[0] * grad_merge_data[i * grad_width + j] / + (std::sqrt(moment_data[merge_rows[i] * grad_width + j]) + epsilon); + } + } + } +}; + +template struct SparseAdagradFunctor; +template struct SparseAdagradFunctor; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(adagrad, ops::AdagradOp, ops::AdagradOpMaker); +REGISTER_OP_CPU_KERNEL( + adagrad, ops::AdagradOpKernel, + ops::AdagradOpKernel); diff --git a/paddle/fluid/operators/adagrad_op.cu b/paddle/fluid/operators/adagrad_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..1117363c133fe02c0c6b0a563d0b3665efb7fb18 --- /dev/null +++ b/paddle/fluid/operators/adagrad_op.cu @@ -0,0 +1,119 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/adagrad_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { + +namespace { + +template +__global__ void MergeGradKernel(const T* grad, const int64_t* grad_rows, + T* grad_merge, const int64_t* grad_merge_rows, + size_t grad_merge_rows_size, + int64_t row_numel) { + const int ty = blockIdx.y; + int tid = threadIdx.x; + __shared__ size_t grad_merge_idx; + + if (tid == 0) { + for (size_t i = 0; i < grad_merge_rows_size; i++) { + if (grad_rows[ty] == grad_merge_rows[i]) { + grad_merge_idx = i; + } + } + } + + __syncthreads(); + + grad += ty * row_numel; + grad_merge += grad_merge_idx * row_numel; + for (int index = tid; index < row_numel; index += block_size) { + paddle::platform::CudaAtomicAdd(grad_merge + index, grad[index]); + } +} + +template +__global__ void SparseAdagradFunctorKernel(const T* grad, const int64_t* rows, + const T* learning_rate, T* param, + T* moment, int64_t row_numel, + T epsilon) { + const int ty = blockIdx.y; + int tid = threadIdx.x; + + grad += ty * row_numel; + param += rows[ty] * row_numel; + moment += rows[ty] * row_numel; + + for (int index = tid; index < row_numel; index += block_size) { + // Since index in rows of SelectedRows can be duplicate, we have to use + // Atomic Operation to avoid concurrent write error. 
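+      // blockIdx.y selects the row handled by this block and its threads
+      // stride over the row's columns, so the per-element Adagrad step
+      //   param -= lr[0] * grad / (sqrt(moment) + epsilon)
+      // is applied with an atomic add to stay correct under concurrent writes.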
+ paddle::platform::CudaAtomicAdd(param + index, + -1.0 * learning_rate[0] * grad[index] / + (sqrt(moment[index]) + epsilon)); + } +} +} // namespace + +template +struct SparseAdagradFunctor { + void operator()(const platform::CUDADeviceContext& context, + const framework::SelectedRows& grad, + const framework::Tensor& learning_rate, T epsilon, + framework::Tensor* moment, framework::Tensor* param) { + // 1. g_m.rows = set(g.rows) + auto grad_width = grad.value().dims()[1]; + math::scatter::MergeAdd merge_func; + auto grad_merge = merge_func(context, grad); + auto* grad_merge_data = grad_merge.mutable_value()->template data(); + framework::Vector merge_rows(grad_merge.rows()); + // 2. m += g_m * g_m + math::scatter::Mul sqare_func; + auto grad_square = sqare_func(context, grad_merge, grad_merge); + + math::SelectedRowsAddToTensor functor; + functor(context, grad_square, moment); + + // 3. update parameter + auto* lr = learning_rate.data(); + auto* param_data = param->data(); + auto* moment_data = moment->data(); + + const int block_size = 256; + dim3 threads(block_size, 1); + dim3 grid2(1, merge_rows.size()); + SparseAdagradFunctorKernel< + T, 256><<(context) + .stream()>>>( + grad_merge_data, merge_rows.CUDAMutableData(context.GetPlace()), lr, + param_data, moment_data, grad_width, epsilon); + } +}; + +template struct SparseAdagradFunctor; +template struct SparseAdagradFunctor; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + adagrad, ops::AdagradOpKernel, + ops::AdagradOpKernel); diff --git a/paddle/fluid/operators/adagrad_op.h b/paddle/fluid/operators/adagrad_op.h new file mode 100644 index 0000000000000000000000000000000000000000..ee503b2c36299c7550f6679fe6e4bca7c33c8eee --- /dev/null +++ b/paddle/fluid/operators/adagrad_op.h @@ -0,0 +1,87 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +struct SparseAdagradFunctor { + void operator()(const DeviceContext& context, + const framework::SelectedRows& grad, + const framework::Tensor& learning_rate, T epsilon, + framework::Tensor* moment, framework::Tensor* param); +}; + +template +class AdagradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* param_out_tensor = ctx.Output("ParamOut"); + auto* moment_out_tensor = ctx.Output("MomentOut"); + + param_out_tensor->mutable_data(ctx.GetPlace()); + moment_out_tensor->mutable_data(ctx.GetPlace()); + + T epsilon = static_cast(ctx.Attr("epsilon")); + + auto* grad_var = ctx.InputVar("Grad"); + if (grad_var->IsType()) { + auto param = framework::EigenVector::Flatten( + *ctx.Input("Param")); + auto grad = framework::EigenVector::Flatten( + *ctx.Input("Grad")); + auto moment = framework::EigenVector::Flatten( + *ctx.Input("Moment")); + auto* learning_rate = ctx.Input("LearningRate"); + + auto param_out = framework::EigenVector::Flatten(*param_out_tensor); + auto moment_out = framework::EigenVector::Flatten(*moment_out_tensor); + auto* place = ctx.template device_context().eigen_device(); + + moment_out.device(*place) = moment + grad * grad; + Eigen::DSizes m_dsize(moment_out_tensor->numel()); + if (platform::is_cpu_place(ctx.GetPlace())) { + auto* lr = learning_rate->data(); + param_out.device(*place) = + param - lr[0] * grad / (moment_out.sqrt() + epsilon); + } else { + auto lr = framework::EigenVector::Flatten(*learning_rate); + param_out.device(*place) = + param - + lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon); + } + } else if (grad_var->IsType()) { + auto* param_tensor = ctx.Input("Param"); + PADDLE_ENFORCE_EQ(param_tensor, param_out_tensor); + + auto* moment_tensor = ctx.Input("Moment"); + PADDLE_ENFORCE_EQ(moment_tensor, moment_out_tensor); + + SparseAdagradFunctor functor; + functor(ctx.template device_context(), + *ctx.Input("Grad"), + *ctx.Input("LearningRate"), epsilon, + moment_out_tensor, param_out_tensor); + } else { + PADDLE_THROW("Unsupported Variable Type of Grad"); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/adam_op.cc b/paddle/fluid/operators/adam_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..25da9336b28ca8d14bf01ce8ca13bd8b379e9b10 --- /dev/null +++ b/paddle/fluid/operators/adam_op.cc @@ -0,0 +1,133 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/adam_op.h" + +namespace paddle { +namespace operators { + +class AdamOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Param"), + "Input(Param) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(Grad) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Moment1"), + "Input(Moment1) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Moment2"), + "Input(Moment2) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("LearningRate"), + "Input(LearningRate) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"), + "Input(Beta1Pow) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Beta2Pow"), + "Input(Beta2Pow) of AdamOp should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + "Output(ParamOut) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Moment1Out"), + "Output(Moment1Out) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Moment2Out"), + "Output(Moment2Out) of AdamOp should not be null."); + + auto lr_dims = ctx->GetInputDim("LearningRate"); + PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, + "Learning rate should have 1 dimension"); + auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow"); + PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1, + "Beta1 power accumulator should have 1 dimension"); + auto beta2_pow_dims = ctx->GetInputDim("Beta2Pow"); + PADDLE_ENFORCE_EQ(framework::product(beta2_pow_dims), 1, + "Beta2 power accumulator should have 1 dimension"); + + auto param_dims = ctx->GetInputDim("Param"); + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("Grad"), + "Param and Grad input of AdamOp should have same dimension"); + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("Moment1"), + "Param and Moment1 input of AdamOp should have same dimension"); + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("Moment2"), + "Param and Moment2 input of AdamOp should have same dimension"); + + ctx->SetOutputDim("ParamOut", param_dims); + ctx->SetOutputDim("Moment1Out", param_dims); + ctx->SetOutputDim("Moment2Out", param_dims); + } +}; + +class AdamOpMaker : public framework::OpProtoAndCheckerMaker { + public: + AdamOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Param", "(Tensor) Input parameter"); + AddInput("Grad", "(Tensor) Input gradient"); + AddInput("LearningRate", "(Tensor) Learning rate"); + AddInput("Moment1", "(Tensor) Input first moment"); + AddInput("Moment2", "(Tensor) Input second moment"); + AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator"); + AddInput("Beta2Pow", "(Tensor) Input beta2 power accumulator"); + + AddOutput("ParamOut", "(Tensor) Output parameter"); + AddOutput("Moment1Out", "(Tensor) Output first moment"); + AddOutput("Moment2Out", "(Tensor) Output second moment"); + + AddAttr("beta1", + "(float, default 0.9) " + "Exponential decay rate for the " + "first moment estimates.") + .SetDefault(0.9f); + AddAttr("beta2", + "(float, default 0.999) " + "exponential decay rate for the " + "second moment estimates.") + .SetDefault(0.999f); + AddAttr("epsilon", + "(float, default 1.0e-8) " + "Constant for numerical stability") + .SetDefault(1.0e-8f); + + AddComment(R"DOC( +Adam Optimizer. 
+ +This implements the Adam optimizer from Section 2 of the Adam +paper : https://arxiv.org/abs/1412.6980. +Adam is a first-order gradient-based optimization method based on +adaptive estimates of lower-order moments. + +Adam updates: + +$$ +moment\_1\_out = \beta_1 * moment\_1 + (1 - \beta_1) * grad \\ +moment\_2_\out = \beta_2 * moment\_2 + (1 - \beta_2) * grad * grad \\ +learning\_rate = learning\_rate * + \frac{\sqrt{1 - \beta_{2\_pow}}}{1 - \beta_{1\_pow}} \\ +param\_out = param - learning\_rate * \frac{moment\_1}{\sqrt{moment\_2} + \epsilon} +$$ + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(adam, ops::AdamOp, ops::AdamOpMaker); +REGISTER_OP_CPU_KERNEL( + adam, ops::AdamOpKernel, + ops::AdamOpKernel); diff --git a/paddle/fluid/operators/adam_op.cu b/paddle/fluid/operators/adam_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..85b806eb6a1c0ed28cd47331786b66fc2e3a21eb --- /dev/null +++ b/paddle/fluid/operators/adam_op.cu @@ -0,0 +1,21 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/adam_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + adam, ops::AdamOpKernel, + ops::AdamOpKernel); diff --git a/paddle/fluid/operators/adam_op.h b/paddle/fluid/operators/adam_op.h new file mode 100644 index 0000000000000000000000000000000000000000..a51b46ef15778cf83d4d4f9c2d8f366b1c5d6b9f --- /dev/null +++ b/paddle/fluid/operators/adam_op.h @@ -0,0 +1,229 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include // for sqrt in CPU and CUDA +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detail/safe_ref.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { + +namespace scatter = paddle::operators::math::scatter; + +template +struct AdamFunctor { + T beta1_; + T beta2_; + T epsilon_; + + const T* beta1_pow_; + const T* beta2_pow_; + const T* moment1_; + T* moment1_out_; + const T* moment2_; + T* moment2_out_; + const T* lr_; + const T* grad_; + const T* param_; + T* param_out_; + + AdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow, + const T* beta2_pow, const T* mom1, T* mom1_out, const T* mom2, + T* mom2_out, const T* lr, const T* grad, const T* param, + T* param_out) + : beta1_(beta1), + beta2_(beta2), + epsilon_(epsilon), + beta1_pow_(beta1_pow), + beta2_pow_(beta2_pow), + moment1_(mom1), + moment1_out_(mom1_out), + moment2_(mom2), + moment2_out_(mom2_out), + lr_(lr), + grad_(grad), + param_(param), + param_out_(param_out) {} + + inline HOSTDEVICE void operator()(size_t i) const { + // Merge all memory access together. + T g = grad_[i]; + T mom1 = moment1_[i]; + T mom2 = moment2_[i]; + T lr = *lr_; + T beta1_pow = *beta1_pow_; + T beta2_pow = *beta2_pow_; + T p = param_[i]; + + // Calculation + lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow); + mom1 = beta1_ * mom1 + (1 - beta1_) * g; + mom2 = beta2_ * mom2 + (1 - beta2_) * g * g; + p -= lr * (mom1 / (sqrt(mom2) + epsilon_)); + + // Write back to global memory + moment1_out_[i] = mom1; + moment2_out_[i] = mom2; + param_out_[i] = p; + } +}; + +template +struct SparseAdamFunctor { + T beta1_; + T beta2_; + T epsilon_; + + const T* beta1_pow_; + const T* beta2_pow_; + const T* moment1_; + T* moment1_out_; + const T* moment2_; + T* moment2_out_; + const T* lr_; + const T* grad_; + const T* param_; + T* param_out_; + + const int64_t* rows_; + int64_t row_numel_; + + SparseAdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow, + const T* beta2_pow, const T* mom1, T* mom1_out, + const T* mom2, T* mom2_out, const T* lr, const T* grad, + const T* param, T* param_out, const int64_t* rows, + int64_t row_numel) + : beta1_(beta1), + beta2_(beta2), + epsilon_(epsilon), + beta1_pow_(beta1_pow), + beta2_pow_(beta2_pow), + moment1_(mom1), + moment1_out_(mom1_out), + moment2_(mom2), + moment2_out_(mom2_out), + lr_(lr), + grad_(grad), + param_(param), + param_out_(param_out), + rows_(rows), + row_numel_(row_numel) {} + + inline HOSTDEVICE void operator()(size_t i) const { + T beta1_pow = *beta1_pow_; + T beta2_pow = *beta2_pow_; + for (int64_t j = 0; j < row_numel_; ++j) { + T g = grad_[i * row_numel_ + j]; + T mom1 = moment1_[rows_[i] * row_numel_ + j]; + T mom2 = moment2_[rows_[i] * row_numel_ + j]; + T lr = *lr_; + T p = param_[rows_[i] * row_numel_ + j]; + + lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow); + mom1 = beta1_ * mom1 + (1 - beta1_) * g; + mom2 = beta2_ * mom2 + (1 - beta2_) * g * g; + p -= lr * (mom1 / (sqrt(mom2) + epsilon_)); + + moment1_out_[rows_[i] * row_numel_ + j] = mom1; + moment2_out_[rows_[i] * row_numel_ + j] = mom2; + param_out_[rows_[i] * row_numel_ + j] = p; + } // for col id + } +}; + +template +class AdamOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using paddle::framework::LoDTensor; + using paddle::operators::detail::Ref; + + T beta1 = static_cast(ctx.Attr("beta1")); + T beta2 = 
static_cast(ctx.Attr("beta2")); + T epsilon = static_cast(ctx.Attr("epsilon")); + auto& param = Ref(ctx.Input("Param"), "Must set Param"); + // auto& grad = Ref(ctx.Input("Grad"), "Must set Grad"); + auto* grad_var = ctx.InputVar("Grad"); + auto& mom1 = Ref(ctx.Input("Moment1"), "Must set Moment1"); + auto& mom2 = Ref(ctx.Input("Moment2"), "Must set Moment2"); + auto& lr = + Ref(ctx.Input("LearningRate"), "Must set LearningRate"); + + auto& beta1_pow = + Ref(ctx.Input("Beta1Pow"), "Must set Beta1Pow"); + auto& beta2_pow = + Ref(ctx.Input("Beta2Pow"), "Must set Beta2Pow"); + + auto& param_out = + Ref(ctx.Output("ParamOut"), "Must set ParamOut"); + auto& mom1_out = + Ref(ctx.Output("Moment1Out"), "Must set Moment1Out"); + auto& mom2_out = + Ref(ctx.Output("Moment2Out"), "Must set Moment1Out"); + + if (grad_var->IsType()) { + auto& grad = Ref(ctx.Input("Grad"), "Must set Grad"); + AdamFunctor functor( + beta1, beta2, epsilon, beta1_pow.template data(), + beta2_pow.template data(), mom1.template data(), + mom1_out.template mutable_data(ctx.GetPlace()), + mom2.template data(), + mom2_out.template mutable_data(ctx.GetPlace()), + lr.template data(), grad.template data(), + param.template data(), + param_out.template mutable_data(ctx.GetPlace())); + platform::ForRange for_range( + static_cast(ctx.device_context()), + param.numel()); + for_range(functor); + } else if (grad_var->IsType()) { + auto& grad = + Ref(ctx.Input("Grad"), "Must set Grad"); + // merge duplicated rows if any. + scatter::MergeAdd merge_func; + auto grad_merge = + merge_func(ctx.template device_context(), grad); + auto& grad_tensor = grad_merge.value(); + const T* grad_data = grad_tensor.template data(); + int64_t* rows = nullptr; + if (platform::is_gpu_place(ctx.GetPlace())) { + rows = grad_merge.mutable_rows()->CUDAMutableData(ctx.GetPlace()); + } else { + rows = grad_merge.mutable_rows()->data(); + } + auto row_numel = grad_tensor.numel() / grad_merge.rows().size(); + + SparseAdamFunctor functor( + beta1, beta2, epsilon, beta1_pow.template data(), + beta2_pow.template data(), mom1.template data(), + mom1_out.template mutable_data(ctx.GetPlace()), + mom2.template data(), + mom2_out.template mutable_data(ctx.GetPlace()), + lr.template data(), grad_data, param.template data(), + param_out.template mutable_data(ctx.GetPlace()), rows, row_numel); + platform::ForRange for_range( + static_cast(ctx.device_context()), + grad_merge.rows().size()); + for_range(functor); + } else { + PADDLE_THROW("Variable type not supported by adam_op"); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/adamax_op.cc b/paddle/fluid/operators/adamax_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..b2249b8f96da86438748ab5b2b0f748cc590b8f7 --- /dev/null +++ b/paddle/fluid/operators/adamax_op.cc @@ -0,0 +1,132 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/adamax_op.h" + +namespace paddle { +namespace operators { + +class AdamaxOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Param"), + "Input(Param) of AdamaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(Grad) of AdamaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Moment"), + "Input(Moment) of AdamaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("InfNorm"), + "Input(InfNorm) of AdamaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("LearningRate"), + "Input(LearningRate) of AdamaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"), + "Input(Beta1Pow) of AdamaxOp should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + "Output(ParamOut) of AdamaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("MomentOut"), + "Output(MomentOut) of AdamaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("InfNormOut"), + "Output(InfNormOut) of AdamaxOp should not be null."); + + auto lr_dims = ctx->GetInputDim("LearningRate"); + PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, + "Learning rate should have 1 dimension"); + auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow"); + PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1, + "Beta1 power accumulator should have 1 dimension"); + auto param_dims = ctx->GetInputDim("Param"); + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("Grad"), + "Param and Grad input of AdamaxOp should have same dimension"); + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("Moment"), + "Param and Moment input of AdamaxOp should have same dimension"); + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("InfNorm"), + "Param and InfNorm input of AdamaxOp should have same dimension"); + + ctx->SetOutputDim("ParamOut", param_dims); + ctx->SetOutputDim("MomentOut", param_dims); + ctx->SetOutputDim("InfNormOut", param_dims); + } +}; + +class AdamaxOpMaker : public framework::OpProtoAndCheckerMaker { + public: + AdamaxOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Param", "(Tensor) Input parameter"); + AddInput("Grad", "(Tensor) Input gradient"); + AddInput("LearningRate", "(Tensor) Learning rate"); + AddInput("Moment", "(Tensor) First moment"); + AddInput("InfNorm", + "(Tensor) " + "Input exponentially weighted infinity norm"); + AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator"); + + AddOutput("ParamOut", "(Tensor) Output parameter"); + AddOutput("MomentOut", "(Tensor) Output first moment"); + AddOutput("InfNormOut", + "(Tensor) " + "Output exponentially weighted infinity norm"); + + AddAttr("beta1", + "(float, default 0.9) " + "Exponential decay rate for the " + "1st moment estimates.") + .SetDefault(0.9f); + AddAttr("beta2", + "(float, default 0.999) " + "exponential decay rate for the weighted " + "infinity norm estimates.") + .SetDefault(0.999f); + AddAttr("epsilon", + "(float, default 1.0e-8) " + "Constant for numerical stability") + .SetDefault(1.0e-8f); + AddComment(R"DOC( +Adamax Optimizer. + +We implement the Adamax optimizer from Section 7 of the Adam +paper: https://arxiv.org/abs/1412.6980. Adamax is a variant of the +Adam algorithm based on the infinity norm. 
+ +Adamax updates: + +$$ +moment\_out = \beta_1 * moment + (1 - \beta_1) * grad \\ +inf\_norm\_out = max(\beta_2 * inf\_norm + \epsilon, |grad|) \\ +learning\_rate = \frac{learning\_rate}{1 - \beta_{1\_pow}} \\ +param\_out = param - learning\_rate * \frac{moment\_out}{inf\_norm\_out} +$$ + +The original paper does not have an epsilon attribute. +However, it is added here for numerical stability to prevent the +division by 0 error. + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(adamax, ops::AdamaxOp, ops::AdamaxOpMaker); +REGISTER_OP_CPU_KERNEL( + adamax, ops::AdamaxOpKernel, + ops::AdamaxOpKernel); diff --git a/paddle/fluid/operators/adamax_op.cu b/paddle/fluid/operators/adamax_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..44a5d6c7bdeac94ceb710d981c6445c046528cb0 --- /dev/null +++ b/paddle/fluid/operators/adamax_op.cu @@ -0,0 +1,21 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/adamax_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + adamax, ops::AdamaxOpKernel, + ops::AdamaxOpKernel); diff --git a/paddle/fluid/operators/adamax_op.h b/paddle/fluid/operators/adamax_op.h new file mode 100644 index 0000000000000000000000000000000000000000..124453c0eceb4caa53bf63a8d9e8c4b90a2213c9 --- /dev/null +++ b/paddle/fluid/operators/adamax_op.h @@ -0,0 +1,67 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class AdamaxOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto param_out_tensor = ctx.Output("ParamOut"); + auto moment_out_tensor = ctx.Output("MomentOut"); + auto inf_norm_out_tensor = ctx.Output("InfNormOut"); + + param_out_tensor->mutable_data(ctx.GetPlace()); + moment_out_tensor->mutable_data(ctx.GetPlace()); + inf_norm_out_tensor->mutable_data(ctx.GetPlace()); + + T beta1 = static_cast(ctx.Attr("beta1")); + T beta2 = static_cast(ctx.Attr("beta2")); + T epsilon = static_cast(ctx.Attr("epsilon")); + + auto param = framework::EigenVector::Flatten( + *ctx.Input("Param")); + auto grad = framework::EigenVector::Flatten( + *ctx.Input("Grad")); + auto moment = framework::EigenVector::Flatten( + *ctx.Input("Moment")); + auto inf_norm = framework::EigenVector::Flatten( + *ctx.Input("InfNorm")); + auto lr = framework::EigenVector::Flatten( + *ctx.Input("LearningRate")); + auto beta1_pow = framework::EigenVector::Flatten( + *ctx.Input("Beta1Pow")); + auto param_out = framework::EigenVector::Flatten(*param_out_tensor); + auto moment_out = framework::EigenVector::Flatten(*moment_out_tensor); + auto inf_norm_out = + framework::EigenVector::Flatten(*inf_norm_out_tensor); + auto* place = ctx.template device_context().eigen_device(); + + moment_out.device(*place) = beta1 * moment + (1 - beta1) * grad; + inf_norm_out.device(*place) = + grad.abs().cwiseMax((beta2 * inf_norm) + epsilon); + auto lr_t = lr / (1 - beta1_pow); + Eigen::DSizes m_dsize(moment_out_tensor->numel()); + param_out.device(*place) = + param - lr_t.broadcast(m_dsize) * (moment_out / inf_norm_out); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/array_operator.h b/paddle/fluid/operators/array_operator.h new file mode 100644 index 0000000000000000000000000000000000000000..4ffb414ecea350006e5a370a0b25ae304cace89c --- /dev/null +++ b/paddle/fluid/operators/array_operator.h @@ -0,0 +1,57 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { +class ArrayOp : public framework::OperatorBase { + public: + ArrayOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + protected: + size_t GetOffset(const framework::Scope &scope, + const platform::Place &place) const { + auto *i = scope.FindVar(Input("I")); + PADDLE_ENFORCE(i != nullptr, "I must be set"); + auto &i_tensor = i->Get(); + PADDLE_ENFORCE_EQ(i_tensor.numel(), 1); + + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + size_t offset; + if (platform::is_gpu_place(i_tensor.place())) { + // FIXME: Avoid copy from GPU to CPU + framework::Tensor t; + framework::Copy(i_tensor, platform::CPUPlace(), dev_ctx, &t); + dev_ctx.Wait(); + offset = static_cast(*t.data()); + } else { + offset = static_cast(*i_tensor.data()); + } + VLOG(10) << " Offset = " << offset; + return offset; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..bf8e11bd8c047275fe341ead9424d02e98d5d8f4 --- /dev/null +++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc @@ -0,0 +1,177 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include + +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +using LoD = framework::LoD; + +class ArrayToLoDTensorOp : public framework::OperatorBase { + public: + ArrayToLoDTensorOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &dev_place) const override { + auto &x = scope.FindVar(Input("X"))->Get(); + auto &rank_table = + scope.FindVar(Input("RankTable"))->Get(); + auto *out = + scope.FindVar(Output("Out"))->GetMutable(); + + // Check dims, place and data type of input's elements and infer output's + // dim + PADDLE_ENFORCE(!x.empty(), "There's no element in the input array."); + int rank = x[0].dims().size(); + platform::Place place = x[0].place(); + std::type_index data_type = x[0].type(); + framework::DDim ins_dims = framework::slice_ddim(x[0].dims(), 1, rank); + int64_t batch_size = x[0].dims()[0]; + for (size_t i = 1; i < x.size(); ++i) { + PADDLE_ENFORCE_EQ(framework::slice_ddim(x[i].dims(), 1, rank), ins_dims, + "The dimension of the %zu'th element in LoDTensorArray " + "differs from previous ones.", + i); + PADDLE_ENFORCE(platform::places_are_same_class(x[i].place(), place), + "The place class of the %zu'th element in LoDTensorArray " + "differs from previous ones.", + i); + PADDLE_ENFORCE(x[i].type() == data_type, + "The date type of the %zu'th element in LoDTensorArray " + "differs from previous ones.", + i); + batch_size += x[i].dims()[0]; + } + auto ins_dim_vec = framework::vectorize(ins_dims); + ins_dim_vec.insert(ins_dim_vec.begin(), batch_size); + framework::DDim out_dims = framework::make_ddim(ins_dim_vec); + out->Resize(out_dims); + out->mutable_data(place, data_type); + + auto &table_items = rank_table.items(); + std::vector table_item_idx(table_items.size()); + // table_item_idx = range(table_items_idx.size()) + std::iota(table_item_idx.begin(), table_item_idx.end(), 0); + std::sort(table_item_idx.begin(), table_item_idx.end(), + [&](size_t a, size_t b) { + return table_items[a].index < table_items[b].index; + }); + + // Build LoDTensor `out` + framework::LoD *out_lod = out->mutable_lod(); + out_lod->clear(); + size_t out_offset = 0; + auto prefix_lod = rank_table.coarse_lod(); + prefix_lod.emplace_back(); + auto &cur_level_lod = prefix_lod.back(); + cur_level_lod.push_back(0); + for (size_t idx : table_item_idx) { + cur_level_lod.push_back(cur_level_lod.back() + table_items[idx].length); + for (size_t x_idx = 0; x_idx < table_items[idx].length; ++x_idx) { + auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset( + x[x_idx].lod(), idx, idx + 1, 0); + + auto &lod_length = lod_and_offset.first; + framework::AppendLoD(out_lod, lod_length); + + size_t start_offset = lod_and_offset.second.first; + size_t end_offset = lod_and_offset.second.second; + VLOG(10) << "idx=" << idx << " x_idx=" << x_idx << " [" + << ", " << end_offset << "]"; + // Copy data + PADDLE_ENFORCE_GE(end_offset, start_offset); + size_t len = end_offset - start_offset; + if (len == 0) { + continue; + } + auto slice = out->Slice(out_offset, out_offset + len); + + platform::DeviceContextPool &pool = + 
platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + framework::Copy(x[x_idx].Slice(start_offset, end_offset), place, + dev_ctx, &slice); + out_offset += len; + } + } + out_lod->insert(out_lod->begin(), prefix_lod.begin(), prefix_lod.end()); + } +}; + +class ArrayToLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + ArrayToLoDTensorOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(std::vector) A vector of tensors that is going to " + "be casted to a big LoDTensor."); + AddInput("RankTable", + "(LoDRankTable) RankTable provides the coarse lod infomation to " + "build the output LoDTensor. See " + "'paddle/framework/lod_rank_table.h' for more details."); + AddOutput("Out", "(LoDTensor) The LoDTensor formed by input tensor array."); + AddComment( + R"DOC(This Op build a big LoDTensor from a std::vector + and a LoDRankTable. It is supposed to be used in getting dynamic RNN's + outputs back to a normal LoDTensor. The std::vector + would be the output of RNN Op and the LoDRankTable would be build + with RNN's input.)DOC"); + } +}; + +class ArrayToLoDTensorInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInput("X"), + "ArrayToLoDTensorOp must has input X."); + PADDLE_ENFORCE(context->HasInput("RankTable"), + "ArrayToLoDTensorOp must has input RankTable."); + context->SetOutputDim("Out", context->GetInputDim("X")); + } +}; + +class ArrayToLoDTensorGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("lod_tensor_to_array"); + grad_op->SetInput("X", OutputGrad("Out")); + grad_op->SetInput("RankTable", Input("RankTable")); + grad_op->SetOutput("Out", InputGrad("X")); + grad_op->SetAttrMap(Attrs()); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(array_to_lod_tensor, ops::ArrayToLoDTensorOp, + ops::ArrayToLoDTensorOpProtoMaker, + ops::ArrayToLoDTensorInferShape, + ops::ArrayToLoDTensorGradMaker); diff --git a/paddle/fluid/operators/assign_op.cc b/paddle/fluid/operators/assign_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f99f9af4276c0e8928f821ae166d55aed02e8e27 --- /dev/null +++ b/paddle/fluid/operators/assign_op.cc @@ -0,0 +1,143 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { +class AssignFunctor { + public: + AssignFunctor(framework::Variable *out, + const platform::DeviceContext &dev_ctx) + : out_(out), dev_ctx_(dev_ctx) {} + + void operator()(const framework::LoDTensor &lod_tensor) const { + auto &out_tensor = *out_->GetMutable(); + copy_tensor(lod_tensor, &out_tensor); + } + + void operator()(const framework::LoDTensorArray &array) const { + auto &out_array = *out_->GetMutable(); + out_array.resize(array.size()); + for (size_t i = 0; i < array.size(); ++i) { + copy_tensor(array[i], &out_array[i]); + } + } + + void operator()(const framework::SelectedRows &rows) const { + framework::SelectedRows &out_rows = + *out_->GetMutable(); + out_rows.set_rows(rows.rows()); + out_rows.set_height(rows.height()); + auto &t = rows.value(); + auto *m = out_rows.mutable_value(); + framework::Copy(t, t.place(), dev_ctx_, m); + } + + template + void operator()(const T &v) const { + PADDLE_THROW("Not support type for assign op %s", typeid(T).name()); + } + + private: + void copy_tensor(const framework::LoDTensor &lod_tensor, + framework::LoDTensor *out) const { + auto &out_tensor = *out; + Copy(lod_tensor, lod_tensor.place(), dev_ctx_, &out_tensor); + out_tensor.set_lod(lod_tensor.lod()); + } + + framework::Variable *out_; + const platform::DeviceContext &dev_ctx_; +}; + +class AssignOp : public framework::OperatorBase { + public: + AssignOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto *x = scope.FindVar(Input("X")); + if (x == nullptr) { + return; + } + auto *out = scope.FindVar(Output("Out")); + PADDLE_ENFORCE( + out != nullptr, + "The Output(Out) should not be null if the Input(X) is set."); + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + framework::VisitVarType(*x, AssignFunctor(out, dev_ctx)); + } +}; + +class AssignOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + AssignOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(LoDTensor, SelectedRows or LoDTensorArray) The input variable " + "could be LoDTensor, SelectedRows or LoDTensorArray.") + .AsDispensable(); + AddOutput("Out", + "(LoDTensor, SelectedRows or LoDTensorArray) The type of output " + "is the same as input X."); + AddComment(R"DOC(Assign Operator + +Out = X, when type in [LoDTensor/SelectedRows/LoDTensorArray] +raise error if the type is not listed above. 
+)DOC"); + } +}; + +class AssignInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + if (context->HasInput("X")) { + auto type = context->GetInputsVarType("X")[0]; + if (type == framework::proto::VarDesc_VarType_SELECTED_ROWS || + type == framework::proto::VarDesc_VarType_LOD_TENSOR) { + context->SetOutputDim("Out", context->GetInputDim("X")); + } + } + } +}; + +class AssignGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *op = new framework::OpDesc(); + op->SetType("assign"); + op->SetInput("X", OutputGrad("Out")); + op->SetOutput("Out", InputGrad("X")); + return std::unique_ptr(op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(assign, ops::AssignOp, ops::AssignGradMaker, + ops::AssignInferShape, ops::AssignOpProtoMaker); diff --git a/paddle/fluid/operators/assign_value_op.cc b/paddle/fluid/operators/assign_value_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..835043d9ab49a20a73d5dd0fff936cb3e9473b1e --- /dev/null +++ b/paddle/fluid/operators/assign_value_op.cc @@ -0,0 +1,73 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/assign_value_op.h" + +namespace paddle { +namespace operators { + +class AssignValueOp : public framework::OperatorWithKernel { + public: + AssignValueOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of AssignValueOp should not be null."); + auto shape = ctx->Attrs().Get>("shape"); + ctx->SetOutputDim("Out", framework::make_ddim(shape)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::proto::DataType(ctx.Attr("dtype")), ctx.GetPlace()); + } +}; + +class AssignValueOpMaker : public framework::OpProtoAndCheckerMaker { + public: + AssignValueOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddOutput("Out", "(Tensor) Output tensor of assign_value operator."); + AddAttr>("shape", + "(vector) " + "Shape of values."); + AddAttr("dtype", "data type of values") + .InEnum({framework::proto::DataType::INT32, + framework::proto::DataType::FP32}); + AddAttr>("fp32_values", "store the float values") + .SetDefault({}); + AddAttr>("int32_values", "store the int values") + .SetDefault({}); + AddComment(R"DOC( +AssignValue operator + +$$Out = values$$ +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(assign_value, ops::AssignValueOp, ops::AssignValueOpMaker); +REGISTER_OP_CPU_KERNEL(assign_value, ops::AssignValueKernel, + ops::AssignValueKernel); diff --git a/paddle/fluid/operators/assign_value_op.cu.cc b/paddle/fluid/operators/assign_value_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..616163f97b9b917187ff66339c01f95289f2f618 --- /dev/null +++ b/paddle/fluid/operators/assign_value_op.cu.cc @@ -0,0 +1,19 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/assign_value_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(assign_value, ops::AssignValueKernel, + ops::AssignValueKernel); diff --git a/paddle/fluid/operators/assign_value_op.h b/paddle/fluid/operators/assign_value_op.h new file mode 100644 index 0000000000000000000000000000000000000000..33a344cad596a079faf2582ee1d9dc497531465a --- /dev/null +++ b/paddle/fluid/operators/assign_value_op.h @@ -0,0 +1,50 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { + +template +class AssignValueKernel : public framework::OpKernel { + public: + virtual void Compute(const framework::ExecutionContext& ctx) const { + auto shape = ctx.Attr>("shape"); + auto* out = ctx.Output("Out"); + int dtype = ctx.Attr("dtype"); + const char* value_name = nullptr; + switch (dtype) { + case framework::proto::DataType::INT32: + value_name = "int32_values"; + break; + case framework::proto::DataType::FP32: + value_name = "fp32_values"; + break; + default: + PADDLE_THROW("Unsupported dtype for assign_value_op: %d", dtype); + break; + } + auto values = ctx.Attr>(value_name); + framework::CopyFromVector(values, ctx.device_context(), out); + out->Resize(framework::make_ddim(shape)); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/auc_op.cc b/paddle/fluid/operators/auc_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..8ac08ea4a19b981b0dc8dac43e4ae5de7b09bb5d --- /dev/null +++ b/paddle/fluid/operators/auc_op.cc @@ -0,0 +1,99 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/auc_op.h" + +namespace paddle { +namespace operators { + +class AucOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Out"), "Input of Out should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Indices"), + "Input of Indices should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), + "Input of Label should not be null."); + auto inference_height = ctx->GetInputDim("Out")[0]; + auto label_height = ctx->GetInputDim("Label")[0]; + + PADDLE_ENFORCE_EQ(inference_height, label_height, + "Out and Label should have same height."); + + ctx->SetOutputDim("AUC", {1}); + ctx->ShareLoD("Out", /*->*/ "AUC"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Out")->type()), + ctx.device_context()); + } +}; + +class AucOpMaker : public framework::OpProtoAndCheckerMaker { + public: + AucOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Out", + "A floating point 2D tensor, values are in the range [0, 1]." + "Each row is sorted in descending order. This input should be the" + "output of topk." + "Typically, this tensor indicates the probability of each label"); + AddInput("Indices", + "An int 2D tensor, indicating the indices of original" + "tensor before sorting. Typically, this tensor indicates which " + "label the probability stands for."); + AddInput("Label", + "A 2D int tensor indicating the label of the training data." + "The height is batch size and width is always 1."); + // TODO(typhoonzero): support weight input + AddOutput("AUC", + "A scalar representing the " + "current area-under-the-curve."); + + AddAttr("curve", "Curve type, can be 'ROC' or 'PR'.") + .SetDefault("ROC"); + AddAttr("num_thresholds", + "The number of thresholds to use when discretizing the" + " roc curve.") + .SetDefault(200); + + AddComment(R"DOC( +Area Under The Curve (AUC) Operator. + +This implementation computes the AUC according to forward output and label. +It is used very widely in binary classification evaluation. As a note: +If input label contains values other than 0 and 1, it will be cast +to bool. You can find the relevant definitions here: +https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve + +There are two types of possible curves: +1. ROC: Receiver operating characteristic +2. PR: Precision Recall +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(auc, ops::AucOp, ops::AucOpMaker); +REGISTER_OP_CPU_KERNEL(auc, ops::AucKernel); diff --git a/paddle/fluid/operators/auc_op.h b/paddle/fluid/operators/auc_op.h new file mode 100644 index 0000000000000000000000000000000000000000..e648db70974087f84020f45c568fb0c1924a88dd --- /dev/null +++ b/paddle/fluid/operators/auc_op.h @@ -0,0 +1,132 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +using EigenVector = framework::EigenVector; + +template +class AucKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* inference = ctx.Input("Out"); + auto* label = ctx.Input("Label"); + auto* auc = ctx.Output("AUC"); + + float* auc_data = auc->mutable_data(ctx.GetPlace()); + + std::string curve = ctx.Attr("curve"); + int num_thresholds = ctx.Attr("num_thresholds"); + std::vector thresholds_list; + thresholds_list.reserve(num_thresholds); + for (int i = 1; i < num_thresholds - 1; i++) { + thresholds_list[i] = (float)i / (num_thresholds - 1); + } + const float kEpsilon = 1e-7; + thresholds_list[0] = 0.0f - kEpsilon; + thresholds_list[num_thresholds - 1] = 1.0f + kEpsilon; + + size_t batch_size = inference->dims()[0]; + size_t inference_width = inference->dims()[1]; + + const T* inference_data = inference->data(); + const int64_t* label_data = label->data(); + + // Create local tensor for storing the curve: TP, FN, TN, FP + // TODO(typhoonzero): use eigen op to caculate these values. + Tensor true_positive, false_positive, true_negative, false_negative; + + true_positive.Resize({num_thresholds}); + false_negative.Resize({num_thresholds}); + true_negative.Resize({num_thresholds}); + false_positive.Resize({num_thresholds}); + + int64_t* tp_data = true_positive.mutable_data(ctx.GetPlace()); + int64_t* fn_data = false_negative.mutable_data(ctx.GetPlace()); + int64_t* tn_data = true_negative.mutable_data(ctx.GetPlace()); + int64_t* fp_data = false_positive.mutable_data(ctx.GetPlace()); + + for (int idx_thresh = 0; idx_thresh < num_thresholds; idx_thresh++) { + // caculate TP, FN, TN, FP for current thresh + int64_t tp = 0, fn = 0, tn = 0, fp = 0; + for (size_t i = 0; i < batch_size; i++) { + // NOTE: label_data used as bool, labels >0 will be treated as true. + if (label_data[i]) { + // use first(max) data in each row + if (inference_data[i * inference_width] >= + (thresholds_list[idx_thresh])) { + tp++; + } else { + fn++; + } + } else { + if (inference_data[i * inference_width] >= + (thresholds_list[idx_thresh])) { + fp++; + } else { + tn++; + } + } + } + // store rates + tp_data[idx_thresh] = tp; + fn_data[idx_thresh] = fn; + tn_data[idx_thresh] = tn; + fp_data[idx_thresh] = fp; + } + // epsilon to avoid divide by zero. + float epsilon = 1e-6; + // Riemann sum to caculate auc. 
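+    // Integrate the curve with the trapezoidal rule. For ROC the x axis is the
+    // false positive rate (fp_rate) and the y axis the true positive rate
+    // (tp_rate); for PR the x axis is recall (tp_rate) and the y axis is
+    // precision (held in rec_rate), each segment adding dx * (y_i + y_j) / 2.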
+ Tensor tp_rate, fp_rate, rec_rate; + tp_rate.Resize({num_thresholds}); + fp_rate.Resize({num_thresholds}); + rec_rate.Resize({num_thresholds}); + float* tp_rate_data = tp_rate.mutable_data(ctx.GetPlace()); + float* fp_rate_data = fp_rate.mutable_data(ctx.GetPlace()); + float* rec_rate_data = rec_rate.mutable_data(ctx.GetPlace()); + for (int i = 0; i < num_thresholds; i++) { + tp_rate_data[i] = + ((float)tp_data[i] + epsilon) / (tp_data[i] + fn_data[i] + epsilon); + fp_rate_data[i] = (float)fp_data[i] / (fp_data[i] + tn_data[i] + epsilon); + rec_rate_data[i] = + ((float)tp_data[i] + epsilon) / (tp_data[i] + fp_data[i] + epsilon); + } + *auc_data = 0.0f; + if (curve == "ROC") { + for (int i = 0; i < num_thresholds - 1; i++) { + auto dx = fp_rate_data[i] - fp_rate_data[i + 1]; + auto y = (tp_rate_data[i] + tp_rate_data[i + 1]) / 2.0f; + *auc_data = *auc_data + dx * y; + } + } else if (curve == "PR") { + for (int i = 1; i < num_thresholds; i++) { + auto dx = tp_rate_data[i] - tp_rate_data[i - 1]; + auto y = (rec_rate_data[i] + rec_rate_data[i - 1]) / 2.0f; + *auc_data = *auc_data + dx * y; + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..506c25d50d453ef841e6885c412ccff38f25cebb --- /dev/null +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -0,0 +1,448 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/batch_norm_op.h" +#include "paddle/fluid/framework/data_layout.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using DataLayout = framework::DataLayout; + +template +using EigenArrayMap = + Eigen::Map>; +template +using ConstEigenArrayMap = + Eigen::Map>; +template +using EigenVectorArrayMap = Eigen::Map>; +template +using ConstEigenVectorArrayMap = + Eigen::Map>; + +class BatchNormOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), ""); + PADDLE_ENFORCE(ctx->HasInput("Scale"), ""); + PADDLE_ENFORCE(ctx->HasInput("Bias"), ""); + PADDLE_ENFORCE(ctx->HasInput("Mean"), ""); + PADDLE_ENFORCE(ctx->HasInput("Variance"), ""); + PADDLE_ENFORCE(ctx->HasOutput("Y"), ""); + PADDLE_ENFORCE(ctx->HasOutput("MeanOut"), ""); + PADDLE_ENFORCE(ctx->HasOutput("VarianceOut"), ""); + PADDLE_ENFORCE(ctx->HasOutput("SavedMean"), ""); + PADDLE_ENFORCE(ctx->HasOutput("SavedVariance"), ""); + + // make sure Mean/MeanOut and Variance/VarianceOut share memory in Python + PADDLE_ENFORCE_EQ(ctx->Inputs("Mean")[0], ctx->Outputs("MeanOut")[0], + "Mean and MeanOut should share the same memory"); + PADDLE_ENFORCE_EQ(ctx->Inputs("Variance")[0], + ctx->Outputs("VarianceOut")[0], + "Variance and VarianceOut should share the same memory"); + + const auto x_dims = ctx->GetInputDim("X"); + const DataLayout data_layout = framework::StringToDataLayout( + ctx->Attrs().Get("data_layout")); + + PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, + "Input X must have 2 to 5 dimensions."); + + const int64_t C = + (data_layout == DataLayout::kNCHW ? x_dims[1] + : x_dims[x_dims.size() - 1]); + + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], C); + + ctx->SetOutputDim("Y", x_dims); + ctx->SetOutputDim("MeanOut", {C}); + ctx->SetOutputDim("VarianceOut", {C}); + ctx->SetOutputDim("SavedMean", {C}); + ctx->SetOutputDim("SavedVariance", {C}); + ctx->ShareLoD("X", "Y"); + } +}; + +class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker { + public: + BatchNormOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddAttr("is_test", "").SetDefault(false); + AddAttr("momentum", "").SetDefault(0.9); + AddAttr("epsilon", "") + .SetDefault(1e-5) + .AddCustomChecker([](const float &epsilon) { + PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f, + "'epsilon' should be between 0.0 and 0.001."); + }); + AddAttr("data_layout", "").SetDefault("NCHW"); + AddInput("X", "The input tensor"); + AddInput("Scale", + "Scale is a 1-dimensional tensor of size C " + "that is applied to the output"); + AddInput("Bias", + "Bias is a 1-dimensional tensor of size C " + "that is applied to the output"); + AddInput("Mean", + "The global mean (for training) or " + "estimated mean (for testing)"); + AddInput("Variance", + "The global variance (for training) " + "or estimated Variance (for testing)"); + AddOutput("Y", "result after normalization"); + AddOutput("MeanOut", + "Share memory with Mean. " + "Store the global mean when training"); + AddOutput("VarianceOut", + "Share memory with Variance. 
" + "Store the global Variance when training"); + AddOutput("SavedMean", + "Mean of the current mini batch, " + "will apply to output when training") + .AsIntermediate(); + AddOutput("SavedVariance", + "Variance of the current mini batch, " + "will apply to output when training") + .AsIntermediate(); + AddComment(R"DOC( +Batch Normalization. + +Batch Norm has been implemented as discussed in the paper: +https://arxiv.org/pdf/1502.03167.pdf +Can be used as a normalizer function for conv2d and fully_connected operations. +The required data format for this layer is one of the following: +1. NHWC `[batch, in_height, in_width, in_channels]` +2. NCHW `[batch, in_channels, in_height, in_width]` + +)DOC"); + } +}; + +template +class BatchNormKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const float epsilon = ctx.Attr("epsilon"); + const float momentum = ctx.Attr("momentum"); + const bool is_test = ctx.Attr("is_test"); + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + + const auto *x = ctx.Input("X"); + const auto &x_dims = x->dims(); + PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, + "The Input dim size should be between 2 and 5"); + const int N = x_dims[0]; + const int C = + (data_layout == DataLayout::kNCHW ? x_dims[1] + : x_dims[x_dims.size() - 1]); + const int sample_size = x->numel() / N / C; + + auto *y = ctx.Output("Y"); + auto *mean_out = ctx.Output("MeanOut"); + auto *variance_out = ctx.Output("VarianceOut"); + auto *saved_mean = ctx.Output("SavedMean"); + auto *saved_variance = ctx.Output("SavedVariance"); + + // alloc memory + y->mutable_data(ctx.GetPlace()); + mean_out->mutable_data(ctx.GetPlace()); + variance_out->mutable_data(ctx.GetPlace()); + saved_mean->mutable_data(ctx.GetPlace()); + saved_variance->mutable_data(ctx.GetPlace()); + + if (!is_test) { + // saved_xx is use just in this batch of data + EigenVectorArrayMap saved_mean_e( + saved_mean->mutable_data(ctx.GetPlace()), C); + EigenVectorArrayMap saved_variance_e( + saved_variance->mutable_data(ctx.GetPlace()), C); + saved_mean_e.setZero(); + saved_variance_e.setZero(); + + switch (data_layout) { + case DataLayout::kNCHW: { + ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); + for (int nc = 0; nc < N * C; ++nc) { + saved_mean_e(nc % C) += x_arr.col(nc).sum(); + } + saved_mean_e /= N * sample_size; + for (int nc = 0; nc < N * C; ++nc) { + saved_variance_e(nc % C) += + (x_arr.col(nc) - saved_mean_e(nc % C)).matrix().squaredNorm(); + } + saved_variance_e /= N * sample_size; + break; + } + case DataLayout::kNHWC: { + ConstEigenArrayMap x_arr(x->data(), C, N * sample_size); + for (int i = 0; i < N * sample_size; ++i) { + saved_mean_e += x_arr.col(i); + } + saved_mean_e /= N * sample_size; + for (int i = 0; i < N * sample_size; ++i) { + saved_variance_e += + (x_arr.col(i) - saved_mean_e) * (x_arr.col(i) - saved_mean_e); + } + saved_variance_e /= N * sample_size; + break; + } + default: + PADDLE_THROW("Unknown storage order: %s", data_layout_str); + } + + EigenVectorArrayMap running_mean_arr( + mean_out->mutable_data(ctx.GetPlace()), C); + EigenVectorArrayMap running_var_arr( + variance_out->mutable_data(ctx.GetPlace()), C); + running_mean_arr = + running_mean_arr * momentum + saved_mean_e * (1. - momentum); + running_var_arr = + running_var_arr * momentum + saved_variance_e * (1. 
- momentum); + } + + // use SavedMean and SavedVariance to do normalize + Eigen::Array inv_std(C); + if (is_test) { + ConstEigenVectorArrayMap var_arr( + ctx.Input("Variance")->data(), C); + inv_std = (var_arr + epsilon).sqrt().inverse(); + } else { + EigenVectorArrayMap saved_inv_std( + ctx.Output("SavedVariance")->data(), C); + // inverse SavedVariance first, gradient will use it too. + saved_inv_std = (saved_inv_std + epsilon).inverse().sqrt(); + inv_std = saved_inv_std; + } + ConstEigenVectorArrayMap mean_arr( + is_test ? ctx.Input("Mean")->data() + : ctx.Output("SavedMean")->data(), + C); + + // ((x - est_mean) * (inv_var) * scale + bias + // formula transform ====> + // (x * inv_var * scale) + (bias - est_mean * inv_var * scale) + const auto *scale = ctx.Input("Scale"); + const auto *bias = ctx.Input("Bias"); + ConstEigenVectorArrayMap scale_arr(scale->data(), C); + ConstEigenVectorArrayMap bias_arr(bias->data(), C); + Eigen::Array new_scale = inv_std * scale_arr; + Eigen::Array new_bias = + bias_arr - mean_arr * inv_std * scale_arr; + + switch (data_layout) { + case DataLayout::kNCHW: { + EigenArrayMap y_arr(y->mutable_data(ctx.GetPlace()), sample_size, + N * C); + ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); + for (int nc = 0; nc < N * C; ++nc) { + y_arr.col(nc) = x_arr.col(nc) * new_scale(nc % C) + new_bias(nc % C); + } + break; + } + case DataLayout::kNHWC: { + EigenArrayMap(y->mutable_data(ctx.GetPlace()), C, + N * sample_size) = + (ConstEigenArrayMap(x->data(), C, N * sample_size).colwise() * + new_scale) + .colwise() + + new_bias; + break; + } + default: + PADDLE_THROW("Unknown storage order: %d", data_layout); + } + } +}; + +class BatchNormGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + // check input + PADDLE_ENFORCE(ctx->HasInput("X")); + PADDLE_ENFORCE(ctx->HasInput("Scale"), ""); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), ""); + PADDLE_ENFORCE(ctx->HasInput("SavedMean"), ""); + PADDLE_ENFORCE(ctx->HasInput("SavedVariance"), ""); + + // check output + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), ""); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Scale")), ""); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")), ""); + + const auto x_dims = ctx->GetInputDim("X"); + const DataLayout data_layout = framework::StringToDataLayout( + ctx->Attrs().Get("data_layout")); + const int C = + (data_layout == DataLayout::kNCHW ? 
x_dims[1] + : x_dims[x_dims.size() - 1]); + + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + ctx->SetOutputDim(framework::GradVarName("Scale"), {C}); + ctx->SetOutputDim(framework::GradVarName("Bias"), {C}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + const auto *var = ctx.InputVar(framework::GradVarName("Y")); + if (var == nullptr) { + PADDLE_THROW("can't find Y@GRAD"); + } + const Tensor *t = nullptr; + if (var->IsType()) { + t = &var->Get(); + } else if (var->IsType()) { + t = &var->Get(); + } + if (t == nullptr) { + PADDLE_THROW("can't find Y@GRAD"); + } + return framework::OpKernelType(framework::ToDataType(t->type()), + ctx.GetPlace()); + } +}; + +template +class BatchNormGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const auto *x = ctx.Input("X"); + const auto *d_y = ctx.Input(framework::GradVarName("Y")); + const auto *scale = ctx.Input("Scale"); + const auto *saved_mean = ctx.Input("SavedMean"); + // SavedVariance have been reverted in forward operator + const auto *saved_inv_variance = ctx.Input("SavedVariance"); + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + + // Get the size for each dimension. + // NCHW [batch_size, in_channels, in_height, in_width] + const auto &x_dims = x->dims(); + PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, + "The Input dim size should be between 2 and 5"); + const int N = x_dims[0]; + const int C = + (data_layout == DataLayout::kNCHW ? x_dims[1] + : x_dims[x_dims.size() - 1]); + const int sample_size = x->numel() / N / C; + + ConstEigenVectorArrayMap scale_arr(scale->data(), C); + ConstEigenVectorArrayMap mean_arr(saved_mean->data(), C); + ConstEigenVectorArrayMap inv_var_arr(saved_inv_variance->data(), C); + + // init output + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_scale = ctx.Output(framework::GradVarName("Scale")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + + d_x->mutable_data(ctx.GetPlace()); + d_scale->mutable_data(ctx.GetPlace()); + d_bias->mutable_data(ctx.GetPlace()); + + // d_bias = np.sum(d_y, axis=0) + // d_scale = np.sum((X - mean) / inv_std * dy, axis=0) + // d_x = (1. 
/ N) * scale * inv_var * (N * d_y - np.sum(d_y, axis=0) + // - (X - mean) * inv_var * inv_var * np.sum(d_y * (X - mean), axis=0)) + + EigenVectorArrayMap d_bias_arr(d_bias->mutable_data(ctx.GetPlace()), + C); + EigenVectorArrayMap d_scale_arr(d_scale->mutable_data(ctx.GetPlace()), + C); + + d_bias_arr.setZero(); + d_scale_arr.setZero(); + + const auto scale_inv_var_nhw = scale_arr * inv_var_arr / (N * sample_size); + + switch (data_layout) { + case DataLayout::kNCHW: { + ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); + ConstEigenArrayMap d_y_arr(d_y->data(), sample_size, N * C); + EigenArrayMap d_x_arr(d_x->mutable_data(ctx.GetPlace()), + sample_size, N * C); + d_x_arr.setZero(); + + for (int nc = 0; nc < N * C; ++nc) { + int c = nc % C; + d_bias_arr(c) += d_y_arr.col(nc).sum(); + d_scale_arr(c) += + ((x_arr.col(nc) - mean_arr(c)) * inv_var_arr(c) * d_y_arr.col(nc)) + .sum(); + } + for (int nc = 0; nc < N * C; ++nc) { + int c = nc % C; + d_x_arr.col(nc) += + scale_inv_var_nhw(c) * + (d_y_arr.col(nc) * N * sample_size - d_bias_arr(c) - + (x_arr.col(nc) - mean_arr[c]) * d_scale_arr(c) * inv_var_arr(c)); + } + break; + } + case DataLayout::kNHWC: { + ConstEigenArrayMap x_arr(x->data(), C, N * sample_size); + ConstEigenArrayMap d_y_arr(d_y->data(), C, N * sample_size); + EigenArrayMap d_x_arr(d_x->mutable_data(ctx.GetPlace()), C, + N * sample_size); + d_x_arr.setZero(); + + const auto d_y_row_sum = d_y_arr.rowwise().sum(); + const auto x_minus_mean = x_arr.colwise() - mean_arr; + const auto d_y_mul_x_minus_mean_row_sum = + (d_y_arr * x_minus_mean).rowwise().sum(); + const auto inv_var_sqr = inv_var_arr * inv_var_arr; + for (int nhw = 0; nhw < N * sample_size; ++nhw) { + d_bias_arr += d_y_arr.col(nhw); + d_scale_arr += + (x_arr.col(nhw) - mean_arr) * inv_var_arr * d_y_arr.col(nhw); + d_x_arr.col(nhw) += + scale_inv_var_nhw * + (d_y_arr.col(nhw) * N * sample_size - d_y_row_sum - + x_minus_mean.col(nhw) * inv_var_sqr * + d_y_mul_x_minus_mean_row_sum); + } + break; + } + default: + PADDLE_THROW("Unknown storage order: %s", data_layout_str); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker, + batch_norm_grad, ops::BatchNormGradOp); +REGISTER_OP_CPU_KERNEL( + batch_norm, + ops::BatchNormKernel); +REGISTER_OP_CPU_KERNEL( + batch_norm_grad, + ops::BatchNormGradKernel); diff --git a/paddle/fluid/operators/batch_norm_op.cu.cc b/paddle/fluid/operators/batch_norm_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..b9c97211e14c0ef3a99a7e2b5cbfd8b267d40c1e --- /dev/null +++ b/paddle/fluid/operators/batch_norm_op.cu.cc @@ -0,0 +1,278 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/batch_norm_op.h" +#include "paddle/fluid/framework/data_layout.h" + +#include +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using DataLayout = framework::DataLayout; +template +using CudnnDataType = platform::CudnnDataType; + +void ExtractNCWHD(const framework::DDim &dims, const DataLayout &data_layout, + int *N, int *C, int *H, int *W, int *D) { + *N = dims[0]; + if (dims.size() == 2) { + *C = dims[1]; + *H = 1; + *W = 1; + *D = 1; + } else { + *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1]; + *H = data_layout == DataLayout::kNCHW ? dims[2] : dims[1]; + *W = dims.size() > 3 + ? (data_layout == DataLayout::kNCHW ? dims[3] : dims[2]) + : 1; + *D = dims.size() > 4 + ? (data_layout == DataLayout::kNCHW ? dims[4] : dims[3]) + : 1; + } +} + +template +class BatchNormKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + double epsilon = static_cast(ctx.Attr("epsilon")); + const float momentum = ctx.Attr("momentum"); + const bool is_test = ctx.Attr("is_test"); + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + + // Get the size for each dimension. + // NCHW [batch_size, in_channels, in_height, in_width] + const auto *x = ctx.Input("X"); + const auto &x_dims = x->dims(); + PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, + "The Input dim size should be between 2 and 5"); + int N, C, H, W, D; + ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); + + // ------------------- cudnn descriptors --------------------- + cudnnTensorDescriptor_t data_desc_; + cudnnTensorDescriptor_t bn_param_desc_; + cudnnBatchNormMode_t mode_; + + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); + + if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { + LOG(ERROR) << "Provided epsilon is smaller than " + << "CUDNN_BN_MIN_EPSILON. Setting it to " + << "CUDNN_BN_MIN_EPSILON instead."; + } + epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); +#if CUDNN_VERSION_MIN(7, 0, 0) + mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; +#else + mode_ = CUDNN_BATCHNORM_SPATIAL; +#endif + + VLOG(1) << "Setting descriptors."; + std::vector dims; + std::vector strides; + if (data_layout == DataLayout::kNCHW) { + dims = {N, C, H, W, D}; + strides = {C * H * W * D, H * W * D, W * D, D, 1}; + } else { + dims = {N, C, H, W, D}; + strides = {H * W * D * C, 1, W * D * C, D * C, C}; + } + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + data_desc_, CudnnDataType::type, + x_dims.size() > 3 ? 
x_dims.size() : 4, dims.data(), strides.data())); + CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor( + bn_param_desc_, data_desc_, mode_)); + + const auto *scale = ctx.Input("Scale"); + const auto *bias = ctx.Input("Bias"); + + auto *y = ctx.Output("Y"); + auto *mean_out = ctx.Output("MeanOut"); + auto *variance_out = ctx.Output("VarianceOut"); + auto *saved_mean = ctx.Output("SavedMean"); + auto *saved_variance = ctx.Output("SavedVariance"); + + // alloc memory + y->mutable_data(ctx.GetPlace()); + mean_out->mutable_data(ctx.GetPlace()); + variance_out->mutable_data(ctx.GetPlace()); + saved_mean->mutable_data(ctx.GetPlace()); + saved_variance->mutable_data(ctx.GetPlace()); + + auto &dev_ctx = ctx.template device_context(); + math::SetConstant functor; + functor(dev_ctx, saved_mean, 0); + functor(dev_ctx, saved_variance, 0); + + auto handle = dev_ctx.cudnn_handle(); + + // Now, depending on whether we are running test or not, we have two paths. + if (is_test) { + // only when test we use input to do computation. + const auto *est_mean = ctx.Input("Mean"); + const auto *est_var = ctx.Input("Variance"); + // Run inference mode. + PADDLE_ENFORCE_EQ(est_mean->dims().size(), 1UL); + PADDLE_ENFORCE_EQ(est_var->dims().size(), 1UL); + PADDLE_ENFORCE_EQ(est_mean->dims()[0], C); + PADDLE_ENFORCE_EQ(est_var->dims()[0], C); + + CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationForwardInference( + handle, + // Note: PERSISTENT not implemented for inference + CUDNN_BATCHNORM_SPATIAL, CudnnDataType::kOne(), + CudnnDataType::kZero(), data_desc_, x->template data(), + data_desc_, y->template mutable_data(ctx.GetPlace()), + bn_param_desc_, scale->template data(), bias->template data(), + est_mean->template data(), est_var->template data(), epsilon)); + } else { + // Run training mode. + // obtain running mean and running inv var, and see if we need to + // initialize them. + double this_factor = 1. - momentum; + + CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationForwardTraining( + handle, mode_, CudnnDataType::kOne(), CudnnDataType::kZero(), + data_desc_, x->template data(), data_desc_, + y->template mutable_data(ctx.GetPlace()), bn_param_desc_, + scale->template data(), bias->template data(), this_factor, + mean_out->template mutable_data(ctx.GetPlace()), + variance_out->template mutable_data(ctx.GetPlace()), epsilon, + saved_mean->template mutable_data(ctx.GetPlace()), + saved_variance->template mutable_data(ctx.GetPlace()))); + } + + // clean when exit. 
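+    // data_desc_ and bn_param_desc_ are created at the top of Compute(), so
+    // destroy them on every call to avoid leaking cuDNN descriptors.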
+ CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); + } +}; + +template +class BatchNormGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + double epsilon = static_cast(ctx.Attr("epsilon")); + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + const auto *x = ctx.Input("X"); + const auto *d_y = ctx.Input(framework::GradVarName("Y")); + const auto *scale = ctx.Input("Scale"); + + const auto &x_dims = x->dims(); + + PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, + "The Input dim size should be between 2 and 5"); + int N, C, H, W, D; + ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); + + PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL); + PADDLE_ENFORCE_EQ(scale->dims()[0], C); + + // ------------------- cudnn descriptors --------------------- + cudnnTensorDescriptor_t data_desc_; + cudnnTensorDescriptor_t bn_param_desc_; + cudnnBatchNormMode_t mode_; + + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); + if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { + LOG(ERROR) << "Provided epsilon is smaller than " + << "CUDNN_BN_MIN_EPSILON. Setting it to " + << "CUDNN_BN_MIN_EPSILON instead."; + } + epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); +#if CUDNN_VERSION_MIN(7, 0, 0) + mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; +#else + mode_ = CUDNN_BATCHNORM_SPATIAL; +#endif + + std::vector dims; + std::vector strides; + if (data_layout == DataLayout::kNCHW) { + dims = {N, C, H, W, D}; + strides = {C * H * W * D, H * W * D, W * D, D, 1}; + } else { + dims = {N, C, H, W, D}; + strides = {H * W * C * D, 1, W * D * C, D * C, C}; + } + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + data_desc_, CudnnDataType::type, + x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data())); + CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor( + bn_param_desc_, data_desc_, mode_)); + + // init output + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_scale = ctx.Output(framework::GradVarName("Scale")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + + d_x->mutable_data(ctx.GetPlace()); + d_scale->mutable_data(ctx.GetPlace()); + d_bias->mutable_data(ctx.GetPlace()); + + const auto *saved_mean = ctx.Input("SavedMean"); + const auto *saved_var = ctx.Input("SavedVariance"); + const void *saved_mean_data = saved_mean->template data(); + const void *saved_var_data = saved_var->template data(); + + auto &dev_ctx = ctx.template device_context(); + CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackward( + dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), + CudnnDataType::kZero(), CudnnDataType::kOne(), + CudnnDataType::kZero(), data_desc_, x->template data(), + data_desc_, d_y->template data(), data_desc_, + d_x->template mutable_data(ctx.GetPlace()), bn_param_desc_, + scale->template data(), + d_scale->template mutable_data(ctx.GetPlace()), + d_bias->template mutable_data(ctx.GetPlace()), epsilon, + saved_mean_data, saved_var_data)); + + // clean when exit. 
+ CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + batch_norm, + ops::BatchNormKernel); +REGISTER_OP_CUDA_KERNEL( + batch_norm_grad, + ops::BatchNormGradKernel); diff --git a/paddle/fluid/operators/batch_norm_op.h b/paddle/fluid/operators/batch_norm_op.h new file mode 100644 index 0000000000000000000000000000000000000000..fa9942ad099f4a28a3abc68c676edeeb827aacd0 --- /dev/null +++ b/paddle/fluid/operators/batch_norm_op.h @@ -0,0 +1,35 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class BatchNormKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override; +}; + +template +class BatchNormGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..7737d4e098ac9a0e56e1db2aee796550e8d71ba3 --- /dev/null +++ b/paddle/fluid/operators/beam_search_decode_op.cc @@ -0,0 +1,144 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/beam_search_decode_op.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +struct BeamSearchDecodeFunctor { + BeamSearchDecodeFunctor(const LoDTensorArray& step_ids, + const LoDTensorArray& step_scores, + LoDTensor* id_tensor, LoDTensor* score_tensor) + : step_ids_(step_ids), + step_scores_(step_scores), + id_tensor_(id_tensor), + score_tensor_(score_tensor) {} + + template + void operator()() const; + + const LoDTensorArray& step_ids_; + const LoDTensorArray& step_scores_; + LoDTensor* id_tensor_; + LoDTensor* score_tensor_; +}; + +template +void BeamSearchDecodeFunctor::operator()() const { + BeamSearchDecoder beam_search_decoder; + beam_search_decoder.PackAllSteps(step_ids_, step_scores_, id_tensor_, + score_tensor_); +} + +template <> +void BeamSearchDecodeFunctor::operator()() const { + PADDLE_THROW("beam search decode op does not support bool!"); +} + +class BeamSearchDecodeOp : public framework::OperatorBase { + public: + BeamSearchDecodeOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope& scope, + const platform::Place& dev_place) const override { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& dev_ctx = *pool.Get(dev_place); + + framework::ExecutionContext ctx(*this, scope, dev_ctx); + + const LoDTensorArray* ids = ctx.Input("Ids"); + const LoDTensorArray* scores = ctx.Input("Scores"); + const size_t step_num = ids->size(); + PADDLE_ENFORCE_GT(step_num, 0UL, + "beam search steps should be larger than 0"); + const size_t source_num = ids->at(0).lod().at(0).size() - 1; + PADDLE_ENFORCE_GT(source_num, 0UL, "source num should be larger than 0"); + + for (size_t i = 0; i < step_num; ++i) { + PADDLE_ENFORCE_EQ(ids->at(i).lod().size(), 2UL, + "Level of LodTensor should be 2"); + } + + // prepare output + LoDTensor* sentenceIds = ctx.Output("SentenceIds"); + LoDTensor* sentenceScores = ctx.Output("SentenceScores"); + + framework::VisitDataType( + framework::ToDataType(scores->at(0).type()), + BeamSearchDecodeFunctor(*ids, *scores, sentenceIds, sentenceScores)); + } +}; + +class BeamSearchDecodeOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + BeamSearchDecodeOpProtoMaker(OpProto* proto, OpAttrChecker* op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Ids", + "(LodTensorArray)" + "score of the candidate words in each step"); + AddInput("Scores", + "(LodTensorArray)" + "score of the candidate words in each step"); + AddOutput("SentenceIds", + "(LodTensor)" + "All possible result sentences of word ids"); + AddOutput("SentenceScores", + "(LodTensor)" + "All possible result sentences of word scores"); + AddComment(R"DOC( +Pack the result of Beam search op into SentenceIds and SentenceScores. 
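+Both outputs carry a two-level LoD: the first level indexes the source
+sentences and the second level indexes the candidate sentences decoded for
+each source.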
+)DOC"); + } +}; + +class BeamSearchDecodeInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* context) const override { + PADDLE_ENFORCE(context->HasInput("Ids"), + "BeamSearchDecodeOp must has input Ids"); + PADDLE_ENFORCE(context->HasInput("Scores"), + "BeamSearchDecodeOp must has input Scores"); + PADDLE_ENFORCE(context->HasOutput("SentenceIds"), + "BeamSearchDecodeOp must has output SentenceIds"); + PADDLE_ENFORCE(context->HasOutput("SentenceScores"), + "BeamSearchDecodeOp must has output SentenceScores"); + } +}; + +class BeamSearchDecodeInferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + for (auto& o : op_desc.Output("SentenceIds")) { + block->Var(o)->SetType(framework::proto::VarDesc::LOD_TENSOR); + } + for (auto& o : op_desc.Output("SentenceScores")) { + block->Var(o)->SetType(framework::proto::VarDesc::LOD_TENSOR); + } + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(beam_search_decode, paddle::operators::BeamSearchDecodeOp, + paddle::operators::BeamSearchDecodeOpProtoMaker, + paddle::operators::BeamSearchDecodeInferShape, + paddle::operators::BeamSearchDecodeInferVarType, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/beam_search_decode_op.h b/paddle/fluid/operators/beam_search_decode_op.h new file mode 100644 index 0000000000000000000000000000000000000000..aeecb8d39acf1e2761aec62b89322c9cbbfe7445 --- /dev/null +++ b/paddle/fluid/operators/beam_search_decode_op.h @@ -0,0 +1,280 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using LoDTensorArray = framework::LoDTensorArray; + +// all the lod have 2 levels. +// The First is source level, the second is sentence level. +// source level describe how many candidate words for this source. 
+// sentence level describe these candidates belong to which prefix +const size_t kSourceLevel = 0; +const size_t kSentenceLevel = 1; + +template +struct BeamNode { + BeamNode(int64_t word_id, T score) : word_id_(word_id), score_(score) {} + + ~BeamNode() { + if (parent_) { + parent_->DropKid(this); + if (parent_->kids_.size() == 0UL) { + delete parent_; + } + } + VLOG(3) << "Delete BeamNode root with word_id:" << this->word_id_; + } + + void AppendTo(BeamNode* parent) { + parent_ = parent; + parent->kids_.insert(this); + } + + void DropKid(BeamNode* kid) { kids_.erase(kid); } + + BeamNode* parent_ = nullptr; + std::unordered_set kids_; + int64_t word_id_; + T score_; +}; + +template +using BeamNodeVector = std::vector>>; + +template +struct Sentence { + std::vector word_ids; + std::vector scores; +}; + +template +using SentenceVector = std::vector>; + +template +struct BeamSearchDecoder { + /** + * make a BeamNode and all it's related prefix BeanNode into a Sentence. + */ + Sentence MakeSentence(const BeamNode* node) const; + + /** + * Param: + * cur_ids: LoDTensor of One step for word ID + * cur_scores: LoDTensor of One Step for word score + * prefixes_list: prefixes for each source sentence. + * sentence_vector_list: result sentence_vector for each source sentence. + * Return: + * a new prefixes list for each source of current step + */ + std::vector> PackTwoSteps( + const LoDTensor& cur_ids, const LoDTensor& cur_scores, + std::vector>& prefixes_list, + std::vector>* sentence_vector_list) const; + + /** + * convert the result sentence_vector for each source sentence into two + * LodTensor. + * One is all candidate sentences with word id, one is all candidate sentences + * with word score. + * Param: + * sentence_vector_list: sentence_vector for each source sentence. + * id_tensor: result LoDTensor for sentences of id. + * score_tensor: result LoDTensor for sentences of score. + */ + void ConvertSentenceVectorToLodTensor( + std::vector> sentence_vector_list, LoDTensor* id_tensor, + LoDTensor* score_tensor) const; + + /** + * Pack all steps of id/score LodTensor into sentence LoDTensor + * it's main logic is: + * ```python + * prefix + * result_sentence + * result_lod_tensor + * + * for (step in steps): + * prefix = PackTwoSteps(prefix, step, &result_sentence) + * ConvertSentenceVectorToLodTensor(result_sentence, &result_lod_tensor) + * ``` + */ + void PackAllSteps(const LoDTensorArray& step_ids, + const LoDTensorArray& step_scores, LoDTensor* id_tensor, + LoDTensor* score_tensor) const; +}; + +template +Sentence BeamSearchDecoder::MakeSentence(const BeamNode* node) const { + Sentence sentence; + while (node != nullptr) { + sentence.word_ids.emplace_back(node->word_id_); + sentence.scores.emplace_back(node->score_); + node = node->parent_; + } + + std::reverse(std::begin(sentence.word_ids), std::end(sentence.word_ids)); + std::reverse(std::begin(sentence.scores), std::end(sentence.scores)); + + return sentence; +} + +template +std::vector> BeamSearchDecoder::PackTwoSteps( + const LoDTensor& cur_ids, const LoDTensor& cur_scores, + std::vector>& prefixes_list, + std::vector>* sentence_vector_list) const { + std::vector> result; + + for (size_t src_idx = 0; src_idx < cur_ids.lod()[kSourceLevel].size() - 1; + ++src_idx) { + size_t src_start = cur_ids.lod().at(kSourceLevel)[src_idx]; + size_t src_end = cur_ids.lod().at(kSourceLevel)[src_idx + 1]; + + BeamNodeVector beam_nodes; + + // if prefixes size is 0, it means this is the first step. 
In this step, + // all candidate id is the start of candidate sentences. + if (prefixes_list.empty()) { + PADDLE_ENFORCE_EQ(cur_ids.lod().at(kSourceLevel).back(), + cur_ids.lod().at(kSentenceLevel).back(), + "in the first step"); + for (size_t id_idx = src_start; id_idx < src_end; ++id_idx) { + beam_nodes.push_back(std::unique_ptr>(new BeamNode( + cur_ids.data()[id_idx], cur_scores.data()[id_idx]))); + } + } else { + BeamNodeVector& prefixes = prefixes_list[src_idx]; + SentenceVector& sentence_vector = (*sentence_vector_list)[src_idx]; + + PADDLE_ENFORCE_EQ(src_end - src_start, prefixes.size(), + "prefix and candidate set number should be the same"); + + auto candidate_offset = cur_ids.lod()[kSentenceLevel]; + for (size_t prefix_idx = 0; prefix_idx < prefixes.size(); ++prefix_idx) { + std::unique_ptr>& prefix = prefixes[prefix_idx]; + size_t candidate_start = candidate_offset[src_start + prefix_idx]; + size_t candidate_end = candidate_offset[src_start + prefix_idx + 1]; + if (candidate_start == candidate_end) { + VLOG(3) << "this sentence has no more candidate, " + "add to result sentence and rm it from beam tree"; + sentence_vector.push_back(MakeSentence(prefix.get())); + prefix.reset(); + } else { + for (size_t candidate_idx = candidate_start; + candidate_idx < candidate_end; ++candidate_idx) { + auto* candidate = + new BeamNode(cur_ids.data()[candidate_idx], + cur_scores.data()[candidate_idx]); + candidate->AppendTo(prefix.get()); + beam_nodes.push_back(std::unique_ptr>(candidate)); + } + prefix.release(); + } + } + } + result.push_back(std::move(beam_nodes)); + } + return result; +} + +template +void BeamSearchDecoder::ConvertSentenceVectorToLodTensor( + std::vector> sentence_vector_list, LoDTensor* id_tensor, + LoDTensor* score_tensor) const { + size_t src_num = sentence_vector_list.size(); + + PADDLE_ENFORCE_NE(src_num, 0, "src_num should not be 0"); + + std::vector source_level_lod = {0}; + std::vector sentence_level_lod = {0}; + std::vector id_data; + std::vector score_data; + + for (size_t src_idx = 0; src_idx < src_num; ++src_idx) { + for (Sentence& sentence : sentence_vector_list[src_idx]) { + id_data.insert(id_data.end(), sentence.word_ids.begin(), + sentence.word_ids.end()); + score_data.insert(score_data.end(), sentence.scores.begin(), + sentence.scores.end()); + sentence_level_lod.push_back(sentence_level_lod.back() + + sentence.word_ids.size()); + } + source_level_lod.push_back(source_level_lod.back() + + sentence_vector_list[src_idx].size()); + } + + auto cpu_place = new paddle::platform::CPUPlace(); + paddle::platform::CPUDeviceContext cpu_ctx(*cpu_place); + + framework::LoD lod; + lod.push_back(source_level_lod); + lod.push_back(sentence_level_lod); + + id_tensor->set_lod(lod); + id_tensor->Resize({static_cast(id_data.size())}); + id_tensor->mutable_data(paddle::platform::CPUPlace()); + framework::CopyFromVector(id_data, cpu_ctx, id_tensor); + + score_tensor->set_lod(lod); + score_tensor->Resize({static_cast(score_data.size())}); + score_tensor->mutable_data(paddle::platform::CPUPlace()); + framework::CopyFromVector(score_data, cpu_ctx, score_tensor); +} + +template +void BeamSearchDecoder::PackAllSteps(const LoDTensorArray& step_ids, + const LoDTensorArray& step_scores, + LoDTensor* id_tensor, + LoDTensor* score_tensor) const { + PADDLE_ENFORCE(!step_ids.empty(), "step num should be larger than 0"); + PADDLE_ENFORCE_EQ(step_ids.size(), step_scores.size(), + "step_ids and step_scores should be the same"); + const size_t step_num = step_ids.size(); + const size_t src_num 
= step_ids.at(0).lod().at(kSourceLevel).size() - 1; + + PADDLE_ENFORCE_GT(src_num, 0UL, "source num should be larger than 0"); + + // previous prefixes for each step, + // the init length is 0, means this is the first step. + std::vector> beamnode_vector_list(0); + std::vector> sentence_vector_list(src_num); + + // pack all steps for one batch first, then another batch + for (size_t step_id = 0; step_id < step_num; ++step_id) { + beamnode_vector_list = + PackTwoSteps(step_ids.at(step_id), step_scores.at(step_id), + beamnode_vector_list, &sentence_vector_list); + } + // append last beam_node to result + for (size_t src_idx = 0; src_idx < src_num; ++src_idx) { + for (auto& beam_node : beamnode_vector_list.at(src_idx)) { + sentence_vector_list[src_idx].push_back(MakeSentence(beam_node.get())); + beam_node.reset(); + } + } + + ConvertSentenceVectorToLodTensor(sentence_vector_list, id_tensor, + score_tensor); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/beam_search_decode_op_test.cc b/paddle/fluid/operators/beam_search_decode_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..24f87279d5eaa19715c31b2228c7a22d4723efae --- /dev/null +++ b/paddle/fluid/operators/beam_search_decode_op_test.cc @@ -0,0 +1,221 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/beam_search_decode_op.h" +#include "gtest/gtest.h" + +using CPUPlace = paddle::platform::CPUPlace; +using LoD = paddle::framework::LoD; +using LoDTensor = paddle::framework::LoDTensor; +using LoDTensorArray = paddle::framework::LoDTensorArray; + +template +using BeamNode = paddle::operators::BeamNode; +template +using BeamSearchDecoder = paddle::operators::BeamSearchDecoder; +template +using Sentence = paddle::operators::Sentence; +template +using BeamNodeVector = paddle::operators::BeamNodeVector; +template +using SentenceVector = paddle::operators::SentenceVector; + +namespace paddle { +namespace test { + +void GenerateExample(const std::vector& level_0, + const std::vector& level_1, + const std::vector& data, LoDTensorArray* ids, + LoDTensorArray* scores) { + PADDLE_ENFORCE_EQ(level_0.back(), level_1.size() - 1, + "source level is used to describe candidate set"); + PADDLE_ENFORCE_EQ(level_1.back(), data.size(), + "the lowest level is used to describe data" + ", so it's last element should be data length"); + + CPUPlace place; + + LoD lod; + lod.push_back(level_0); + lod.push_back(level_1); + + // Ids + LoDTensor tensor_id; + tensor_id.set_lod(lod); + tensor_id.Resize({static_cast(data.size())}); + // malloc memory + int64_t* id_ptr = tensor_id.mutable_data(place); + for (size_t i = 0; i < data.size(); ++i) { + id_ptr[i] = static_cast(data.at(i)); + } + + // Scores + LoDTensor tensor_score; + tensor_score.set_lod(lod); + tensor_score.Resize({static_cast(data.size())}); + // malloc memory + float* score_ptr = tensor_score.mutable_data(place); + for (size_t i = 0; i < data.size(); ++i) { + score_ptr[i] = static_cast(data.at(i)); + } + + ids->push_back(tensor_id); + scores->push_back(tensor_score); +} + +} // namespace test +} // namespace paddle + +TEST(BeamSearchDecodeOp, DeleteBeamNode) { + auto* root = new BeamNode(0, 0); + auto* b1 = new BeamNode(1, 1); + auto* b2 = new BeamNode(2, 2); + auto* b3 = new BeamNode(3, 3); + + b1->AppendTo(root); + b2->AppendTo(root); + b3->AppendTo(b1); + + delete b3; + delete b2; +} + +TEST(BeamSearchDecodeOp, MakeSentence) { + auto* root = new BeamNode(0, 0); + auto* b1 = new BeamNode(1, 1); + auto* end = new BeamNode(2, 2); + b1->AppendTo(root); + end->AppendTo(b1); + + BeamSearchDecoder helper; + Sentence sentence = helper.MakeSentence(end); + delete end; + + std::vector expect_ids = {0, 1, 2}; + ASSERT_EQ(sentence.word_ids, expect_ids); + + std::vector expect_scores = {0, 1, 2}; + ASSERT_EQ(sentence.scores, expect_scores); +} + +TEST(BeamSearchDecodeOp, PackTwoStepsFistStep) { + CPUPlace place; + + LoDTensorArray ids; + LoDTensorArray scores; + + paddle::test::GenerateExample( + std::vector{0, 2, 6}, std::vector{0, 1, 2, 3, 4, 5, 6}, + std::vector{1, 2, 3, 4, 5, 6}, &ids, &scores); + + std::vector> beamnode_vector_list; + std::vector> sentence_vector_list( + 2, SentenceVector()); + + BeamSearchDecoder helper; + beamnode_vector_list = helper.PackTwoSteps( + ids[0], scores[0], beamnode_vector_list, &sentence_vector_list); + ASSERT_EQ(beamnode_vector_list.size(), 2UL); + ASSERT_EQ(beamnode_vector_list[0].size(), 2UL); + ASSERT_EQ(beamnode_vector_list[1].size(), 4UL); +} + +TEST(BeamSearchDecodeOp, PackTwoSteps) { + CPUPlace place; + + // first source has three prefix + BeamNodeVector source0_prefixes; + source0_prefixes.push_back( + std::unique_ptr>(new BeamNode(1, 1))); + source0_prefixes.push_back( + std::unique_ptr>(new BeamNode(0, 0))); + source0_prefixes.push_back( + std::unique_ptr>(new BeamNode(3, 3))); + + 
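+  // Each BeamNode(word_id, score) above is a prefix carried over from a
+  // previous step; PackTwoSteps below attaches this step's candidates to them.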
// second source has two prefix + BeamNodeVector source1_prefixes; + source1_prefixes.push_back( + std::unique_ptr>(new BeamNode(4, 4))); + source1_prefixes.push_back( + std::unique_ptr>(new BeamNode(5, 5))); + + std::vector> beamnode_vector_list; + std::vector> sentence_vector_list( + 2, SentenceVector()); + + beamnode_vector_list.push_back(std::move(source0_prefixes)); + beamnode_vector_list.push_back(std::move(source1_prefixes)); + + // generate data for one step + LoDTensorArray ids; + LoDTensorArray scores; + + paddle::test::GenerateExample(std::vector{0, 3, 5}, + std::vector{0, 1, 1, 3, 4, 5}, + std::vector{0, 1, 2, 3, 4}, &ids, &scores); + + BeamSearchDecoder helper1; + beamnode_vector_list = helper1.PackTwoSteps( + ids[0], scores[0], beamnode_vector_list, &sentence_vector_list); + + ASSERT_EQ(sentence_vector_list[0].size(), 1UL); + ASSERT_EQ(sentence_vector_list[1].size(), 0UL); + ASSERT_EQ(beamnode_vector_list[0].size(), 3UL); + ASSERT_EQ(beamnode_vector_list[1].size(), 2UL); +} + +TEST(BeamSearchDecodeOp, PackAllSteps) { + CPUPlace place; + + // we will constuct a sample data with 3 steps and 2 source sentences + LoDTensorArray ids; + LoDTensorArray scores; + + paddle::test::GenerateExample( + std::vector{0, 3, 6}, std::vector{0, 1, 2, 3, 4, 5, 6}, + std::vector{1, 2, 3, 4, 5, 6}, &ids, &scores); + paddle::test::GenerateExample( + std::vector{0, 3, 6}, std::vector{0, 1, 1, 3, 5, 5, 6}, + std::vector{0, 1, 2, 3, 4, 5}, &ids, &scores); + paddle::test::GenerateExample(std::vector{0, 3, 6}, + std::vector{0, 0, 1, 2, 3, 4, 5}, + std::vector{0, 1, 2, 3, 4}, &ids, &scores); + + ASSERT_EQ(ids.size(), 3UL); + ASSERT_EQ(scores.size(), 3UL); + + BeamSearchDecoder helper; + + LoDTensor id_tensor; + LoDTensor score_tensor; + helper.PackAllSteps(ids, scores, &id_tensor, &score_tensor); + + LoD lod = id_tensor.lod(); + std::vector expect_source_lod = {0, 4, 8}; + EXPECT_EQ(lod[0], expect_source_lod); + std::vector expect_sentence_lod = {0, 1, 3, 6, 9, 10, 13, 16, 19}; + EXPECT_EQ(lod[1], expect_sentence_lod); + // 2| 1, 0| 3, 1, 0| 3, 2, 1| 5| 4, 3, 2| 4, 4, 3| 6, 5, 4 + std::vector expect_data = {2, 1, 0, 3, 1, 0, 3, 2, 1, 5, + 4, 3, 2, 4, 4, 3, 6, 5, 4}; + ASSERT_EQ(id_tensor.dims()[0], static_cast(expect_data.size())); + for (size_t i = 0; i < expect_data.size(); ++i) { + ASSERT_EQ(id_tensor.data()[i], + static_cast(expect_data[i])); + } + for (int64_t i = 0; i < id_tensor.dims()[0]; ++i) { + ASSERT_EQ(score_tensor.data()[i], + static_cast(id_tensor.data()[i])); + } +} diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..6f4c8c7e06ee17b4cf3880db7bc8ddfbb88df3b8 --- /dev/null +++ b/paddle/fluid/operators/beam_search_op.cc @@ -0,0 +1,258 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/beam_search_op.h" + +#include +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +void BeamSearch::operator()(const framework::LoDTensor &pre_ids, + framework::LoDTensor *selected_ids, + framework::LoDTensor *selected_scores) { + auto abs_lod = framework::ToAbsOffset(ids_->lod()); + auto &high_level = abs_lod[lod_level_]; + + auto items = SelectTopBeamSizeItems(); + auto selected_items = ToMap(items, high_level.back()); + VLOG(3) << "selected_items:"; + for (size_t i = 0; i < selected_items.size(); ++i) { + VLOG(3) << "offset:" << i; + for (auto &item : selected_items[i]) { + VLOG(3) << ItemToString(item); + } + } + PruneEndidCandidates(pre_ids, &selected_items); + // calculate the output tensor's height + size_t num_instances = std::accumulate( + std::begin(selected_items), std::end(selected_items), 0, + [](size_t a, std::vector &b) { return a + b.size(); }); + // the output tensor shape should be [num_instances, 1] + auto dims = framework::make_ddim( + std::vector({static_cast(num_instances), 1})); + selected_ids->Resize(dims); + selected_scores->Resize(dims); + + std::map> hash; + framework::LoD new_lod; + auto *ids_data = selected_ids->mutable_data(platform::CPUPlace()); + auto *scores_data = + selected_scores->mutable_data(platform::CPUPlace()); + + // fill in data + std::vector low_level; + size_t low_offset = 0; + for (auto &items : selected_items) { + low_level.push_back(low_offset); + sort(items.begin(), items.end(), [](const Item &a, const Item &b) { + if (a.offset < b.offset) { + return true; + } + return a.id < b.id; + }); + for (auto &item : items) { + ids_data[low_offset] = item.id; + scores_data[low_offset] = item.score; + low_offset++; + } + } + low_level.push_back(low_offset); + + // fill lod + framework::LoD lod(2); + lod[0].assign(high_level.begin(), high_level.end()); + lod[1].assign(low_level.begin(), low_level.end()); + if (!framework::CheckLoD(lod)) { + PADDLE_THROW("lod %s is not right", framework::LoDToString(lod)); + } + selected_ids->set_lod(lod); + selected_scores->set_lod(lod); +} + +int BeamSearch::PruneEndidCandidates(const framework::LoDTensor &pre_ids, + std::vector> *items) { + auto *pre_ids_data = pre_ids.data(); + + int res = 0; + for (size_t offset = 0; offset < items->size(); offset++) { + auto prefix_id = pre_ids_data[offset]; + if (prefix_id == end_id_) { + items->at(offset).clear(); + } else { + res++; + } + } + + return res; +} + +std::vector> BeamSearch::ToMap( + const std::vector> &items, size_t element_num) { + std::vector> result; + result.resize(element_num); + for (auto &entries : items) { + for (const auto &item : entries) { + result[item.offset].push_back(item); + } + } + return result; +} + +std::vector> +BeamSearch::SelectTopBeamSizeItems() { + std::vector> result; + std::vector items; + // for each source sentence, select the top beam_size items across all + // candidate sets. + while (NextItemSet(&items)) { + std::nth_element(std::begin(items), std::begin(items) + beam_size_, + std::end(items), [](const Item &a, const Item &b) { + // TODO(superjom) make score's comparation customizable. + // partial sort in descending order + return a.score > b.score; + }); + // prune the top beam_size items. 
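+    // std::nth_element above only partially sorts: the highest-scoring items
+    // end up in the front (in no particular order), which is enough because
+    // everything beyond beam_size_ is dropped here.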
+ if (items.size() > beam_size_) { + items.resize(beam_size_); + } + result.emplace_back(items); + } + VLOG(3) << "SelectTopBeamSizeItems result size " << result.size(); + for (auto &items : result) { + VLOG(3) << "item set:"; + for (auto &item : items) { + VLOG(3) << ItemToString(item); + } + } + + return result; +} + +// the candidates of a source +bool BeamSearch::NextItemSet(std::vector *items) { + if (sent_offset_ >= ids_->NumElements(lod_level_)) { + return false; + } + // find the current candidates + auto ids = *ids_; + auto scores = *scores_; + + auto abs_lod = framework::ToAbsOffset(ids.lod()); + + auto *ids_data = ids.data(); + auto *scores_data = scores.data(); + + size_t instance_dim = 1; + for (int i = 1; i < ids.dims().size(); i++) { + instance_dim *= ids.dims()[i]; + } + + items->clear(); + items->reserve(framework::product(ids.dims())); + for (size_t offset = abs_lod[lod_level_][sent_offset_]; + offset < abs_lod[lod_level_][sent_offset_ + 1]; offset++) { + for (size_t d = 0; d < instance_dim; d++) { + const size_t dim_offset = offset * instance_dim + d; + items->emplace_back(offset, ids_data[dim_offset], + scores_data[dim_offset]); + } + } + + sent_offset_++; + return true; +} + +std::ostream &operator<<(std::ostream &os, const BeamSearch::Item &item) { + os << "{"; + os << "offset: " << item.offset << ", "; + os << "id: " << item.id << ", "; + os << "score: " << item.score << ""; + os << "}"; + + return os; +} + +std::string ItemToString(const BeamSearch::Item &item) { + std::ostringstream stream; + stream << item; + return stream.str(); +} + +class BeamSearchProtoAndCheckerMaker + : public framework::OpProtoAndCheckerMaker { + public: + BeamSearchProtoAndCheckerMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + // inputs and outputs stored in proto + AddInput("pre_ids", "ids in previous step"); + AddInput("ids", "a LoDTensor of shape of [None,k]"); + AddInput("scores", + "a LoDTensor that has the same shape and LoD with `ids`"); + AddOutput("selected_ids", + "a LoDTensor that stores the IDs selected by beam search"); + AddOutput( + "selected_scores", + "a LoDTensor that has the same shape and LoD with `selected_ids`"); + + // Attributes stored in AttributeMap + AddAttr("level", "the level of LoDTensor"); + AddAttr("beam_size", "beam size for beam search"); + AddAttr("end_id", + "the token id which indicates the end of a sequence"); + + AddComment( + "This is a beam search operator that help to generate sequences."); + } +}; + +class BeamSearchInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + for (const std::string &arg : + std::vector({"pre_ids", "ids", "scores"})) { + PADDLE_ENFORCE(context->HasInput(arg), + "BeamSearch need input argument '%s'", arg); + } + for (const std::string &arg : + std::vector({"selected_ids", "selected_scores"})) { + PADDLE_ENFORCE(context->HasOutput(arg), + "BeamSearch need output argument '%s'", arg); + } + } +}; + +class BeamSearchInferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + for (auto &o : op_desc.Output("selected_ids")) { + block->Var(o)->SetType(framework::proto::VarDesc::LOD_TENSOR); + } + for (auto &o : op_desc.Output("selected_scores")) { + block->Var(o)->SetType(framework::proto::VarDesc::LOD_TENSOR); + } + } +}; + +} // namespace operators +} // namespace paddle + 
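+// EmptyGradOpMaker registers beam_search without a gradient operator; no grad
+// op will be generated for it.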
+REGISTER_OPERATOR(beam_search, paddle::operators::BeamSearchOp, + paddle::operators::BeamSearchProtoAndCheckerMaker, + paddle::operators::BeamSearchInferShape, + paddle::operators::BeamSearchInferVarType, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/beam_search_op.h b/paddle/fluid/operators/beam_search_op.h new file mode 100644 index 0000000000000000000000000000000000000000..9e2a05a60c30e388093aceddd40e58273364c8f9 --- /dev/null +++ b/paddle/fluid/operators/beam_search_op.h @@ -0,0 +1,237 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#ifdef PADDLE_WITH_TESTING +#include "gtest/gtest.h" +#endif + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { + +/* + * This is an implementation of beam search. + * + * To explain the details, lets take machine translation task for example, in + * this task, one source sentence is translated to multiple target sentences, + * during this period, one sentence will be translated to multiple translation + * prefixes(target sentence that have not ended), in each time step a prefix + * will have some candidates, input the candidate ids and their corresponding + * scores (probabilities), it will sort and select the top beam_size candidates + * for each source sentence, and store the selected candidates's score and their + * corresponding ids to LoDTensors. + * + * A detailed example: + * + * Input + * + * ids: + * LoD (should have 2 levels) + * first level: [0, 1, 4] + * second level: [0, 1, 2, 3, 4] + * + * tensor's data + * [ + * [4, 2, 5] + * [2, 1, 3] + * [3, 5, 2] + * [8, 2, 1] + * ] + * + * scores: + * LoD same as `ids` + * tensor's data + * [ + * [0.5, 0.3, 0.2] + * [0.6, 0.3, 0.1] + * [0.9, 0.5, 0.1] + * [0.7, 0.5, 0.1] + * ] + * + * the inputs means that there are 2 source sentences to translate, and the + * first source has 1 prefix, the second source has 2 prefix. + * + * lets assume beam size is 2, and the beam search's output should be + * LoD + * first level: + * [0, 1, 2] + * second level: + * [0, 2, 4] + * + * id tensor's data + * [[ + * 4, + * 1, + * 3, + * 8, + * ]] + * + * score tensor's data + * [[ + * 0.5, + * 0.3, + * 0.9, + * 0.7 + * ]] + * + * TODO all the prune operations should be in the beam search, so it is better + * to split the beam search algorithm into a sequence of smaller operators, and + * the prune operators can be inserted in this sequence. + */ +class BeamSearch { + public: + // TODO(superjom) make type customizable + using id_t = size_t; + using score_t = float; + /* + * Input the arguments that needed by this class. + */ + BeamSearch(const framework::LoDTensor& ids, + const framework::LoDTensor& scores, size_t level, size_t beam_size, + int end_id) + : beam_size_(beam_size), + ids_(&ids), + scores_(&scores), + lod_level_(level), + end_id_(end_id) {} + + /* + * The main function of beam search. 
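+   * For each source sentence, selects the top beam_size candidates of the
+   * current step and writes their ids and scores into the output tensors.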
+ * + * @selected_ids: a [None, 1]-shaped tensor with LoD. + * In a machine translation model, it might be the candidate term id sets, + * each set stored as a varience-length sequence. + * The format might be described with a two-level LoD + * - [[0 1] + * - [0 1 2]] + * - [[] + * - [0 1]] + * the first level of LoD tells that there are two source sentences. The + * second level describes the details of the candidate id set's offsets in + * the + * source sentences. + * + * @selected_scores: a LoD tensor with the same shape and LoD with + * selected_ids. + * It stores the corresponding scores of candidate ids in selected_ids. + * + * Return false if all the input tensor is empty, in machine translation task + * that means no candidates is provided, and the task will stop running. + */ + void operator()(const framework::LoDTensor& pre_ids, + framework::LoDTensor* selected_ids, + framework::LoDTensor* selected_scores); + /* + * The basic items help to sort. + */ + struct Item { + Item() {} + Item(size_t offset, size_t id, float score) + : offset(offset), id(id), score(score) {} + // offset in the higher lod level. + size_t offset; + // // prefix id in the lower lod level. + // size_t prefix; + // the candidate id + id_t id; + // the corresponding score + score_t score; + }; + + protected: + /* + * Delete all the records that follows the end token. + */ + int PruneEndidCandidates(const framework::LoDTensor& pre_ids, + std::vector>* items); + + /* + * Transform the items into a map whose key is offset, value is the items. + * NOTE low performance + */ + std::vector> ToMap( + const std::vector>& inputs, size_t element_num); + + /* + * For each source, select top beam_size records. + */ + std::vector> SelectTopBeamSizeItems(); + + /* + * Get the items of next source sequence, return false if no remaining items. 
+ */ + bool NextItemSet(std::vector* items); + + private: + size_t beam_size_; + const framework::LoDTensor* ids_; + const framework::LoDTensor* scores_; + size_t lod_level_{0}; + size_t sent_offset_{0}; + int end_id_{0}; +}; + +std::ostream& operator<<(std::ostream& os, const BeamSearch::Item& item); + +std::string ItemToString(const BeamSearch::Item& item); + +class BeamSearchOp : public framework::OperatorBase { + public: + BeamSearchOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + BeamSearchOp(const BeamSearchOp& o) + : framework::OperatorBase( + static_cast(o)) { + PADDLE_THROW("Not Implemented"); + } + + void Run(const framework::Scope& scope, + const platform::Place& dev_place) const override { + auto ids_var = scope.FindVar(Input("ids")); + auto scores_var = scope.FindVar(Input("scores")); + auto pre_ids_var = scope.FindVar(Input("pre_ids")); + PADDLE_ENFORCE_NOT_NULL(ids_var); + PADDLE_ENFORCE_NOT_NULL(scores_var); + PADDLE_ENFORCE_NOT_NULL(pre_ids_var); + + auto& ids = ids_var->Get(); + auto& scores = scores_var->Get(); + auto& pre_ids = pre_ids_var->Get(); + size_t level = Attr("level"); + size_t beam_size = Attr("beam_size"); + int end_id = Attr("end_id"); + BeamSearch alg(ids, scores, level, beam_size, end_id); + + auto selected_ids_var = scope.FindVar(Output("selected_ids")); + auto selected_scores_var = scope.FindVar(Output("selected_scores")); + PADDLE_ENFORCE_NOT_NULL(selected_ids_var); + PADDLE_ENFORCE_NOT_NULL(selected_scores_var); + auto& selected_ids_tensor = + *selected_ids_var->GetMutable(); + auto& selected_scores_tensor = + *selected_scores_var->GetMutable(); + alg(pre_ids, &selected_ids_tensor, &selected_scores_tensor); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/beam_search_op_test.cc b/paddle/fluid/operators/beam_search_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..ea2afda4d492ccde8889a394201b398eeff3badb --- /dev/null +++ b/paddle/fluid/operators/beam_search_op_test.cc @@ -0,0 +1,86 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include "paddle/fluid/operators/beam_search_op.h" + +#include +#include + +namespace paddle { +namespace test { + +using std::vector; +using framework::LoDTensor; +using framework::LoD; +using operators::BeamSearch; +using paddle::platform::CPUPlace; +using std::cout; +using std::endl; + +void CreateInput(LoDTensor* ids, LoDTensor* scores) { + LoD lod; + vector level0({0, 1, 4}); + vector level1({0, 1, 2, 3, 4}); + lod.push_back(level0); + lod.push_back(level1); + ids->set_lod(lod); + scores->set_lod(lod); + + auto dims = framework::make_ddim(vector({4, 3})); + ids->Resize(dims); + scores->Resize(dims); + CPUPlace place; + + auto* ids_data = ids->mutable_data(place); + auto* scores_data = scores->mutable_data(place); + vector _ids({4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1}); + vector _scores( + {0.5, 0.3, 0.2, 0.6, 0.3, 0.1, 0.9, 0.5, 0.1, 0.7, 0.5, 0.1}); + + for (int i = 0; i < 12; i++) { + ids_data[i] = _ids[i]; + scores_data[i] = _scores[i]; + } +} + +TEST(beam_search_op, run) { + CPUPlace place; + LoDTensor ids, scores; + CreateInput(&ids, &scores); + + LoDTensor pre_ids; + pre_ids.Resize(framework::make_ddim(vector(4, 1))); + for (int i = 0; i < 4; i++) { + pre_ids.mutable_data(place)[i] = i + 1; + } + + BeamSearch beamsearch(ids, scores, (int64_t)0, (int64_t)2, 0); + LoDTensor sids, sscores; + beamsearch(pre_ids, &sids, &sscores); + + LOG(INFO) << "score: " << sscores << endl; + + ASSERT_EQ(sids.lod(), sscores.lod()); + + vector tids({2, 4, 3, 8}); + vector tscores({0.3, 0.5, 0.9, 0.7}); + + for (int i = 0; i < 4; i++) { + ASSERT_EQ(tids[i], sids.data()[i]); + ASSERT_EQ(tscores[i], sscores.data()[i]); + } +} + +} // namespace test +} // namespace paddle diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.cc b/paddle/fluid/operators/bilinear_tensor_product_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..cc378b1b4536273e4364a488eb7a4ca2cc706782 --- /dev/null +++ b/paddle/fluid/operators/bilinear_tensor_product_op.cc @@ -0,0 +1,169 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/bilinear_tensor_product_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class BilinearTensorProductOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Weight"), + "Input(Weight) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null."); + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + auto weight_dims = ctx->GetInputDim("Weight"); + + PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "The input(X) must be a 2D Tensor."); + PADDLE_ENFORCE_EQ(y_dims.size(), 2UL, "The input(Y) must be a 2D Tensor."); + PADDLE_ENFORCE_EQ(weight_dims.size(), 3UL, + "The input(Weight) must be a 3D tensor."); + PADDLE_ENFORCE_EQ(x_dims[0], y_dims[0], + "The first dimension(batch_size) of input(X) must be " + "equal to the first dimension of the input(Y)."); + PADDLE_ENFORCE_EQ(x_dims[1], weight_dims[1], + "The second dimension of input(X) must be equal to " + "the second dimension of the input(Weight)."); + PADDLE_ENFORCE_EQ(y_dims[1], weight_dims[2], + "The second dimension of input(Y) must be equal to " + "the third dimension of the input(Weight)."); + + if (ctx->HasInput("Bias")) { + auto bias_dims = ctx->GetInputDim("Bias"); + PADDLE_ENFORCE(bias_dims.size() == 2UL && bias_dims[0] == 1UL, + "The Input(Bias) must be a 2-D tensor with " + "the 2nd dimension fixed to 1 (a row vector)."); + PADDLE_ENFORCE_EQ(bias_dims[1], weight_dims[0], + "The second dimension of input(Bias) must be equal " + "to the first dimension of the input(Weight)."); + } + + ctx->SetOutputDim("Out", {x_dims[0], weight_dims[0]}); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class BilinearTensorProductOpMaker : public framework::OpProtoAndCheckerMaker { + public: + BilinearTensorProductOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The first input of bilinear_tensor_product operator."); + AddInput("Y", "The second input of bilinear_tensor_product operator."); + AddInput("Weight", + "The learnable parameters of bilinear_tensor_product operator."); + AddInput("Bias", "The learnable bias of bilinear_tensor_product operator.") + .AsDispensable(); + AddOutput("Out", "The output of bilinear_tensor_product operator."); + AddComment(R"DOC( +Bilinear Tensor Product operator. +Given input X and Y, a 3D tensor Weight and a Bias. Each column of the +Output is computed by one slice $i = 1, . . . 
, k$ of the tensor: + +$$ +M = (X W_i) * Y \\ +Out_i = \sum_j {M_j} + Bias_i +$$ + +Where $W_i$ is the $i$-th slice of Input(Weight); + $M_j$ is the $j$-th column of $M$; + $Out_i$ is the $i$-th column of Output(Out); + $Bias_i$ is a column vector, each element of it is equal to + the $i$-th element of $Bias$; + +)DOC"); + } +}; + +class BilinearTensorProductOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Weight"), + "Input(Weight) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null."); + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + auto weight_dims = ctx->GetInputDim("Weight"); + auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); + + PADDLE_ENFORCE_EQ(out_dims.size(), 2UL, + "The input(Out@GRAD) must be a 2D Tensor."); + PADDLE_ENFORCE_EQ( + x_dims[0], out_dims[0], + "The first dimension(batch_size) of input(Out@GRAD) must be " + "equal to the first dimension of the Input(X)."); + PADDLE_ENFORCE_EQ( + weight_dims[0], out_dims[1], + "The second dimension of input(Out@GRAD) must be equal to " + "the third dimension of the Input(Weight)."); + + if (ctx->HasInput("Bias")) { + auto bias_dims = ctx->GetInputDim("Bias"); + PADDLE_ENFORCE_EQ( + bias_dims[1], out_dims[1], + "The second dimension of input(Out@GRAD) must be equal to " + "the second dimension of the Input(Bias)."); + auto bias_grad_name = framework::GradVarName("Bias"); + if (ctx->HasOutput(bias_grad_name)) + ctx->SetOutputDim(bias_grad_name, bias_dims); + } + + auto x_grad_name = framework::GradVarName("X"); + auto y_grad_name = framework::GradVarName("Y"); + auto weight_grad_name = framework::GradVarName("Weight"); + + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + if (ctx->HasOutput(y_grad_name)) { + ctx->SetOutputDim(y_grad_name, y_dims); + } + if (ctx->HasOutput(weight_grad_name)) { + ctx->SetOutputDim(weight_grad_name, weight_dims); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(bilinear_tensor_product, ops::BilinearTensorProductOp, + ops::BilinearTensorProductOpMaker, bilinear_tensor_product_grad, + ops::BilinearTensorProductOpGrad); +REGISTER_OP_CPU_KERNEL( + bilinear_tensor_product, + ops::BilinearTensorProductKernel, + ops::BilinearTensorProductKernel); +REGISTER_OP_CPU_KERNEL( + bilinear_tensor_product_grad, + ops::BilinearTensorProductGradKernel, + ops::BilinearTensorProductGradKernel); diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.cu b/paddle/fluid/operators/bilinear_tensor_product_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..2cec48ee69ac500c0b0ba84f4b6fc20415f4b82c --- /dev/null +++ b/paddle/fluid/operators/bilinear_tensor_product_op.cu @@ -0,0 +1,30 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/bilinear_tensor_product_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + bilinear_tensor_product, + ops::BilinearTensorProductKernel, + ops::BilinearTensorProductKernel); +REGISTER_OP_CUDA_KERNEL( + bilinear_tensor_product_grad, + ops::BilinearTensorProductGradKernel, + ops::BilinearTensorProductGradKernel); diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.h b/paddle/fluid/operators/bilinear_tensor_product_op.h new file mode 100644 index 0000000000000000000000000000000000000000..626fa957c42c02c978519c1869cd5f0679d22a26 --- /dev/null +++ b/paddle/fluid/operators/bilinear_tensor_product_op.h @@ -0,0 +1,185 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +template +using EigenMatrix = framework::EigenMatrix; + +template +class BilinearTensorProductKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* weight = ctx.Input("Weight"); + auto* bias = ctx.Input("Bias"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + auto y_mat = EigenMatrix::From(*y); + auto output_mat = EigenMatrix::From(*out); + + auto batch_size = x->dims()[0]; + auto weight_dims = weight->dims(); + int out_dim = weight_dims[0]; + auto x_dim = weight_dims[1]; + auto y_dim = weight_dims[2]; + auto& place = *ctx.template device_context().eigen_device(); + auto& dev_ctx = ctx.template device_context(); + + // Create the intermediate variable to caculate the result of + // Input(X) multiplied by Input(Weight_i), the formula is: + // left_mul = X Weight_i. 
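+    // Shape sketch of the computation below (derived from the formula in
+    // bilinear_tensor_product_op.cc):
+    //   X: [batch_size, x_dim], Weight_i: [x_dim, y_dim], Y: [batch_size, y_dim]
+    //   left_mul = X * Weight_i                      -> [batch_size, y_dim]
+    //   Out[:, i] = row-wise sum of (left_mul .* Y)   (element-wise product)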
+ Tensor left_mul; + left_mul.mutable_data(framework::make_ddim({batch_size, y_dim}), + ctx.GetPlace()); + auto left_mul_mat = EigenMatrix::From(left_mul); + + for (int i = 0; i < out_dim; ++i) { + auto output_col_vec = output_mat.chip(i, 1); + Tensor weight_mat = + weight->Slice(i, i + 1).Resize(framework::make_ddim({x_dim, y_dim})); + math::gemm(dev_ctx, CblasNoTrans, CblasNoTrans, + batch_size, y_dim, x_dim, 1, x->data(), + weight_mat.data(), 0, left_mul.data()); + output_col_vec.device(place) = + (left_mul_mat * y_mat).sum(Eigen::DSizes(1)); + } + if (bias) { + auto bias_vec = EigenMatrix::From(*bias); + Eigen::DSizes bcast(batch_size, 1); + output_mat.device(place) = bias_vec.broadcast(bcast) + output_mat; + } + } +}; + +template +class BilinearTensorProductGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const Tensor* x = ctx.Input("X"); + const Tensor* y = ctx.Input("Y"); + const Tensor* weight = ctx.Input("Weight"); + Tensor* d_x = ctx.Output(framework::GradVarName("X")); + Tensor* d_y = ctx.Output(framework::GradVarName("Y")); + Tensor* d_weight = ctx.Output(framework::GradVarName("Weight")); + Tensor* d_bias = ctx.Output(framework::GradVarName("Bias")); + const Tensor* d_out = ctx.Input(framework::GradVarName("Out")); + + auto batch_size = x->dims()[0]; + auto weight_dims = weight->dims(); + int out_dim = weight_dims[0]; + auto x_dim = weight_dims[1]; + auto y_dim = weight_dims[2]; + + auto x_mat = EigenMatrix::From(*x); + auto y_mat = EigenMatrix::From(*y); + auto d_out_mat = EigenMatrix::From(*d_out); + auto& place = *ctx.template device_context().eigen_device(); + auto& dev_ctx = ctx.template device_context(); + // Create the intermediate variable to caculate the Output(Y@Grad). + Tensor x_scale; + x_scale.mutable_data(framework::make_ddim({batch_size, x_dim}), + ctx.GetPlace()); + auto x_scale_mat = EigenMatrix::From(x_scale); + + // Create the intermediate variable to caculate the Output(X@Grad). + Tensor y_scale; + y_scale.mutable_data(framework::make_ddim({batch_size, y_dim}), + ctx.GetPlace()); + auto y_scale_mat = EigenMatrix::From(y_scale); + + math::SetConstant set_zero; + + // Set Output(X@Grad) be zero. + if (d_x) { + d_x->mutable_data(ctx.GetPlace()); + set_zero(dev_ctx, d_x, static_cast(0)); + } + + // Set Output(Y@Grad) be zero. + if (d_y) { + d_y->mutable_data(ctx.GetPlace()); + set_zero(dev_ctx, d_y, static_cast(0)); + } + + // Caculate the Output(X@Grad) and Output(Y@Grad). + if (d_x || d_y) { + Eigen::DSizes bcast_for_x(1, y_dim); + Eigen::DSizes bcast_for_y(1, x_dim); + for (int i = 0; i < out_dim; ++i) { + Tensor weight_i = weight->Slice(i, i + 1).Resize( + framework::make_ddim({x_dim, y_dim})); + auto output_vec = d_out_mat.chip(i, 1); + if (d_x) { + y_scale_mat.device(place) = + output_vec.reshape(Eigen::DSizes(batch_size, 1)) + .broadcast(bcast_for_x) * + y_mat; + math::gemm( + dev_ctx, CblasNoTrans, CblasTrans, batch_size, x_dim, y_dim, 1, + y_scale.data(), weight_i.data(), 1, d_x->data()); + } + if (d_y) { + x_scale_mat.device(place) = + output_vec.reshape(Eigen::DSizes(batch_size, 1)) + .broadcast(bcast_for_y) * + x_mat; + math::gemm( + dev_ctx, CblasNoTrans, CblasNoTrans, batch_size, y_dim, x_dim, 1, + x_scale.data(), weight_i.data(), 1, d_y->data()); + } + } + } + + // Caculate the gradient of Input(Weight). 
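+    // In index form, the slice-wise gradient computed below is
+    //   dWeight_i[p][q] = sum_b dOut[b][i] * X[b][p] * Y[b][q],
+    // evaluated as (broadcast(dOut[:, i]) .* X)^T * Y with a single GEMM.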
+ if (d_weight) { + d_weight->mutable_data(ctx.GetPlace()); + Eigen::DSizes bcast_for_weight(1, x_dim); + for (int i = 0; i < out_dim; ++i) { + Tensor d_weight_i = d_weight->Slice(i, i + 1).Resize( + framework::make_ddim({x_dim, y_dim})); + auto output_vec = d_out_mat.chip(i, 1); + x_scale_mat.device(place) = + output_vec.reshape(Eigen::DSizes(batch_size, 1)) + .broadcast(bcast_for_weight) * + x_mat; + math::gemm(dev_ctx, CblasTrans, CblasNoTrans, x_dim, + y_dim, batch_size, 1, x_scale.data(), + y->data(), 0, d_weight_i.data()); + } + } + + // Caculate the gradient of Input(Bias). + if (d_bias) { + d_bias->mutable_data(ctx.GetPlace()); + auto d_bias_mat = framework::EigenVector::Flatten(*d_bias); + d_bias_mat.device(place) = d_out_mat.sum(Eigen::DSizes(0)); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/bipartite_match_op.cc b/paddle/fluid/operators/bipartite_match_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d614bf704382da7743b2128fa57a147e8db33d24 --- /dev/null +++ b/paddle/fluid/operators/bipartite_match_op.cc @@ -0,0 +1,195 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +class BipartiteMatchOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("DistMat"), + "Input(DistMat) of BipartiteMatch should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("ColToRowMatchIndices"), + "Output(ColToRowMatchIndices) of BipartiteMatch should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("ColToRowMatchDist"), + "Output(ColToRowMatchDist) of BipartiteMatch should not be null."); + + auto dims = ctx->GetInputDim("DistMat"); + PADDLE_ENFORCE_EQ(dims.size(), 2, "The rank of Input(DistMat) must be 2."); + + ctx->SetOutputDim("ColToRowMatchIndices", dims); + ctx->SetOutputDim("ColToRowMatchDist", dims); + } +}; + +template +class BipartiteMatchKernel : public framework::OpKernel { + public: + // The match_indices must be initialized to -1 at first. + // The match_dist must be initialized to 0 at first. 
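+  // Greedy strategy: repeatedly take the unmatched (row, column) pair with
+  // the largest remaining distance, record it, and retire that row, until no
+  // pair with a positive distance is left.
+  // A small hypothetical example: for dist = [[0.7, 0.2], [0.4, 0.9]] the
+  // pair (row 1, col 1) = 0.9 is matched first, then (row 0, col 0) = 0.7,
+  // giving match_indices = [0, 1] and match_dist = [0.7, 0.9].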
+ void BipartiteMatch(const Tensor& dist, int* match_indices, + T* match_dist) const { + constexpr T kEPS = static_cast(1e-6); + PADDLE_ENFORCE_EQ(dist.dims().size(), 2, "The rank of dist must be 2."); + int64_t row = dist.dims()[0]; + int64_t col = dist.dims()[1]; + auto* dist_data = dist.data(); + std::vector row_pool; + for (int i = 0; i < row; ++i) { + row_pool.push_back(i); + } + while (row_pool.size() > 0) { + int max_idx = -1; + int max_row_idx = -1; + T max_dist = -1; + for (int64_t j = 0; j < col; ++j) { + if (match_indices[j] != -1) { + continue; + } + for (size_t k = 0; k < row_pool.size(); ++k) { + int m = row_pool[k]; + // distance is 0 between m-th row and j-th column + if (dist_data[m * col + j] < kEPS) { + continue; + } + if (dist_data[m * col + j] > max_dist) { + max_idx = j; + max_row_idx = m; + max_dist = dist_data[m * col + j]; + } + } + } + if (max_idx == -1) { + // Cannot find good match. + break; + } else { + PADDLE_ENFORCE_EQ(match_indices[max_idx], -1); + match_indices[max_idx] = max_row_idx; + match_dist[max_idx] = max_dist; + // Erase the row index. + row_pool.erase( + std::find(row_pool.begin(), row_pool.end(), max_row_idx)); + } + } + } + + void Compute(const framework::ExecutionContext& context) const override { + auto* dist_mat = context.Input("DistMat"); + auto* match_indices = context.Output("ColToRowMatchIndices"); + auto* match_dist = context.Output("ColToRowMatchDist"); + + auto& dev_ctx = context.device_context(); + + auto col = dist_mat->dims()[1]; + + int64_t n = dist_mat->lod().size() == 0UL + ? 1 + : static_cast(dist_mat->lod().back().size() - 1); + if (dist_mat->lod().size()) { + PADDLE_ENFORCE_EQ(dist_mat->lod().size(), 1UL, + "Only support 1 level of LoD."); + } + match_indices->mutable_data({n, col}, context.GetPlace()); + match_dist->mutable_data({n, col}, context.GetPlace()); + + math::SetConstant iset; + iset(dev_ctx, match_indices, static_cast(-1)); + math::SetConstant tset; + tset(dev_ctx, match_dist, static_cast(0)); + + int* indices = match_indices->data(); + T* dist = match_dist->data(); + if (n == 1) { + BipartiteMatch(*dist_mat, indices, dist); + } else { + auto lod = dist_mat->lod().back(); + for (size_t i = 0; i < lod.size() - 1; ++i) { + Tensor one_ins = dist_mat->Slice(lod[i], lod[i + 1]); + BipartiteMatch(one_ins, indices + i * col, dist + i * col); + } + } + } +}; + +class BipartiteMatchOpMaker : public framework::OpProtoAndCheckerMaker { + public: + BipartiteMatchOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "DistMat", + "(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape " + "[K, M]. It is pair-wise distance matrix between the entities " + "represented by each row and each column. For example, assumed one " + "entity is A with shape [K], another entity is B with shape [M]. The " + "DistMat[i][j] is the distance between A[i] and B[j]. The bigger " + "the distance is, the better macthing the pairs are. Please note, " + "This tensor can contain LoD information to represent a batch of " + "inputs. One instance of this batch can contain different numbers of " + "entities."); + AddOutput("ColToRowMatchIndices", + "(Tensor) A 2-D Tensor with shape [N, M] in int type. " + "N is the batch size. If ColToRowMatchIndices[i][j] is -1, it " + "means B[j] does not match any entity in i-th instance. " + "Otherwise, it means B[j] is matched to row " + "ColToRowMatchIndices[i][j] in i-th instance. 
The row number of " + "i-th instance is saved in ColToRowMatchIndices[i][j]."); + AddOutput("ColToRowMatchDist", + "(Tensor) A 2-D Tensor with shape [N, M] in float type. " + "N is batch size. If ColToRowMatchIndices[i][j] is -1, " + "ColToRowMatchDist[i][j] is also -1.0. Otherwise, assumed " + "ColToRowMatchIndices[i][j] = d, and the row offsets of each " + "instance are called LoD. Then " + "ColToRowMatchDist[i][j] = DistMat[d+LoD[i]][j]"); + AddComment(R"DOC( +This operator is a greedy bipartite matching algorithm, which is used to +obtain the matching with the maximum distance based on the input +distance matrix. For input 2D matrix, the bipartite matching algorithm can +find the matched column for each row, also can find the matched row for +each column. And this operator only calculate matched indices from column +to row. For each instance, the number of matched indices is the number of +of columns of the input ditance matrix. + +There are two outputs to save matched indices and distance. +A simple description, this algothrim matched the best (maximum distance) +row entity to the column entity and the matched indices are not duplicated +in each row of ColToRowMatchIndices. If the column entity is not matched +any row entity, set -1 in ColToRowMatchIndices. + +Please note that the input DistMat can be LoDTensor (with LoD) or Tensor. +If LoDTensor with LoD, the height of ColToRowMatchIndices is batch size. +If Tensor, the height of ColToRowMatchIndices is 1. + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(bipartite_match, ops::BipartiteMatchOp, + ops::BipartiteMatchOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(bipartite_match, ops::BipartiteMatchKernel, + ops::BipartiteMatchKernel); diff --git a/paddle/fluid/operators/box_coder_op.cc b/paddle/fluid/operators/box_coder_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..8e0fee22d8d828978fe74bd48e46ea6c8063150d --- /dev/null +++ b/paddle/fluid/operators/box_coder_op.cc @@ -0,0 +1,121 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/box_coder_op.h" + +namespace paddle { +namespace operators { + +class BoxCoderOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("PriorBox"), + "Input(PriorBox) of BoxCoderOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("PriorBoxVar"), + "Input(PriorBoxVar) of BoxCoderOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("TargetBox"), + "Input(TargetBox) of BoxCoderOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("OutputBox"), + "Output(OutputBox) of BoxCoderOp should not be null."); + + auto prior_box_dims = ctx->GetInputDim("PriorBox"); + auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar"); + auto target_box_dims = ctx->GetInputDim("TargetBox"); + + PADDLE_ENFORCE_EQ(prior_box_dims.size(), 2, + "The rank of Input of PriorBoxVar must be 2"); + PADDLE_ENFORCE_EQ(prior_box_dims[1], 4, "The shape of PriorBox is [N, 4]"); + PADDLE_ENFORCE_EQ(prior_box_dims, prior_box_var_dims); + PADDLE_ENFORCE_EQ(target_box_dims.size(), 2, + "The rank of Input of TargetBox must be 2"); + PADDLE_ENFORCE_EQ(target_box_dims[1], 4, + "The shape of TargetBox is [M, 4]"); + + GetBoxCodeType(ctx->Attrs().Get("code_type")); + + ctx->SetOutputDim( + "OutputBox", + framework::make_ddim({target_box_dims[0], prior_box_dims[0], 4})); + ctx->ShareLoD("TargetBox", /*->*/ "OutputBox"); + } +}; + +class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker { + public: + BoxCoderOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "PriorBox", + "(Tensor, default Tensor) " + "Box list PriorBox is a 2-D Tensor with shape [M, 4] holds M boxes, " + "each box is represented as [xmin, ymin, xmax, ymax], " + "[xmin, ymin] is the left top coordinate of the anchor box, " + "if the input is image feature map, they are close to the origin " + "of the coordinate system. [xmax, ymax] is the right bottom " + "coordinate of the anchor box."); + AddInput("PriorBoxVar", + "(Tensor, default Tensor) " + "PriorBoxVar is a 2-D Tensor with shape [M, 4] holds M group " + "of variance."); + AddInput( + "TargetBox", + "(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape " + "[N, 4], each box is represented as [xmin, ymin, xmax, ymax], " + "[xmin, ymin] is the left top coordinate of the box if the input " + "is image feature map, they are close to the origin of the coordinate " + "system. [xmax, ymax] is the right bottom coordinate of the box. " + "This tensor can contain LoD information to represent a batch " + "of inputs. One instance of this batch can contain different " + "numbers of entities."); + AddAttr("code_type", + "(string, default encode_center_size) " + "the code type used with the target box") + .SetDefault("encode_center_size") + .InEnum({"encode_center_size", "decode_center_size"}); + AddOutput( + "OutputBox", + "(LoDTensor or Tensor) " + "(Tensor) The output of box_coder_op, a tensor with shape [N, M, 4] " + "representing the result of N target boxes encoded/decoded with " + "M Prior boxes and variances."); + + AddComment(R"DOC( +Bounding Box Coder Operator. +Encode/Decode the target bounding box with the priorbox information. 
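+The attribute code_type selects the direction: encode_center_size encodes a
+target box relative to a prior box (anchor) and its variance, while
+decode_center_size maps the encoded representation back to a box.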
+The Encoding schema is described below:
+ox = (tx - px) / pw / pxv
+oy = (ty - py) / ph / pyv
+ow = log(abs(tw / pw)) / pwv
+oh = log(abs(th / ph)) / phv
+The Decoding schema is described below:
+ox = (pw * pxv * tx + px) - (exp(pwv * tw) * pw) / 2
+oy = (ph * pyv * ty + py) - (exp(phv * th) * ph) / 2
+ow = (pw * pxv * tx + px) + (exp(pwv * tw) * pw) / 2
+oh = (ph * pyv * ty + py) + (exp(phv * th) * ph) / 2
+During encoding, tx, ty, tw and th denote the target box's center coordinates,
+width and height, and ox, oy, ow and oh are the encoded offsets. During
+decoding, tx, ty, tw and th are the encoded offsets and ox, oy, ow and oh are
+the decoded box's top-left and bottom-right coordinates. px, py, pw and ph
+denote the priorbox's (anchor) center coordinates, width and height, and pxv,
+pyv, pwv and phv denote the variance of the priorbox.
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(box_coder, ops::BoxCoderOp, ops::BoxCoderOpMaker);
+REGISTER_OP_CPU_KERNEL(box_coder, ops::BoxCoderKernel<float>,
+                       ops::BoxCoderKernel<double>);
diff --git a/paddle/fluid/operators/box_coder_op.cu b/paddle/fluid/operators/box_coder_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..dd9299ceacdf2507f51f895c71041c1645dd8371
--- /dev/null
+++ b/paddle/fluid/operators/box_coder_op.cu
@@ -0,0 +1,150 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include "paddle/fluid/operators/box_coder_op.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { + +template +__global__ void EncodeCenterSizeKernel(const T* prior_box_data, + const T* prior_box_var_data, + const T* target_box_data, const int row, + const int col, const int len, + T* output) { + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < row * col) { + const int row_idx = idx / col; + const int col_idx = idx % col; + T prior_box_width = + prior_box_data[col_idx * len + 2] - prior_box_data[col_idx * len]; + T prior_box_height = + prior_box_data[col_idx * len + 3] - prior_box_data[col_idx * len + 1]; + T prior_box_center_x = + (prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2; + T prior_box_center_y = (prior_box_data[col_idx * len + 3] + + prior_box_data[col_idx * len + 1]) / + 2; + + T target_box_center_x = + (target_box_data[row_idx * len + 2] + target_box_data[row_idx * len]) / + 2; + T target_box_center_y = (target_box_data[row_idx * len + 3] + + target_box_data[row_idx * len + 1]) / + 2; + T target_box_width = + target_box_data[row_idx * len + 2] - target_box_data[row_idx * len]; + T target_box_height = + target_box_data[row_idx * len + 3] - target_box_data[row_idx * len + 1]; + + output[idx * len] = (target_box_center_x - prior_box_center_x) / + prior_box_width / prior_box_var_data[col_idx * len]; + output[idx * len + 1] = (target_box_center_y - prior_box_center_y) / + prior_box_height / + prior_box_var_data[col_idx * len + 1]; + output[idx * len + 2] = log(fabs(target_box_width / prior_box_width)) / + prior_box_var_data[col_idx * len + 2]; + output[idx * len + 3] = log(fabs(target_box_height / prior_box_height)) / + prior_box_var_data[col_idx * len + 3]; + } +} + +template +__global__ void DecodeCenterSizeKernel(const T* prior_box_data, + const T* prior_box_var_data, + const T* target_box_data, const int row, + const int col, const int len, + T* output) { + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < row * col) { + const int row_idx = idx / col; + const int col_idx = idx % col; + T prior_box_width = + prior_box_data[col_idx * len + 2] - prior_box_data[col_idx * len]; + T prior_box_height = + prior_box_data[col_idx * len + 3] - prior_box_data[col_idx * len + 1]; + T prior_box_center_x = + (prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2; + T prior_box_center_y = (prior_box_data[col_idx * len + 3] + + prior_box_data[col_idx * len + 1]) / + 2; + + T target_box_width = exp(prior_box_var_data[col_idx * len + 2] * + target_box_data[row_idx * len + 2]) * + prior_box_width; + T target_box_height = exp(prior_box_var_data[col_idx * len + 3] * + target_box_data[row_idx * len + 3]) * + prior_box_height; + T target_box_center_x = prior_box_var_data[col_idx * len] * + target_box_data[row_idx * len] * + prior_box_width + + prior_box_center_x; + T target_box_center_y = prior_box_var_data[col_idx * len + 1] * + target_box_data[row_idx * len + 1] * + prior_box_height + + prior_box_center_y; + + output[idx * len] = target_box_center_x - target_box_width / 2; + output[idx * len + 1] = target_box_center_y - target_box_height / 2; + output[idx * len + 2] = target_box_center_x + target_box_width / 2; + output[idx * len + 3] = target_box_center_y + target_box_height / 2; + } +} + +template +class BoxCoderCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + 
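+    // Launch configuration (set up below): one thread per (target box, prior
+    // box) pair, with blocks of 512 threads and grid = ceil(row * col / 512).
+    // For a hypothetical 100 target boxes and 30 prior boxes, row * col =
+    // 3000, so 6 blocks are launched and threads with idx >= 3000 exit early
+    // through the bound check in the kernels above.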
PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()), + "This kernel only runs on GPU device."); + auto* prior_box = context.Input("PriorBox"); + auto* prior_box_var = context.Input("PriorBoxVar"); + auto* target_box = context.Input("TargetBox"); + auto* output_box = context.Output("OutputBox"); + + if (target_box->lod().size()) { + PADDLE_ENFORCE_EQ(target_box->lod().size(), 1, + "Only support 1 level of LoD."); + } + auto row = target_box->dims()[0]; + auto col = prior_box->dims()[0]; + auto len = prior_box->dims()[1]; + int block = 512; + int grid = (row * col + block - 1) / block; + auto& device_ctx = context.cuda_device_context(); + + const T* prior_box_data = prior_box->data(); + const T* prior_box_var_data = prior_box_var->data(); + const T* target_box_data = target_box->data(); + + output_box->mutable_data({row, col, len}, context.GetPlace()); + T* output = output_box->data(); + + auto code_type = GetBoxCodeType(context.Attr("code_type")); + if (code_type == BoxCodeType::kEncodeCenterSize) { + EncodeCenterSizeKernel<<>>( + prior_box_data, prior_box_var_data, target_box_data, row, col, len, + output); + } else if (code_type == BoxCodeType::kDecodeCenterSize) { + DecodeCenterSizeKernel<<>>( + prior_box_data, prior_box_var_data, target_box_data, row, col, len, + output); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(box_coder, ops::BoxCoderCUDAKernel, + ops::BoxCoderCUDAKernel); diff --git a/paddle/fluid/operators/box_coder_op.h b/paddle/fluid/operators/box_coder_op.h new file mode 100644 index 0000000000000000000000000000000000000000..c41bcc212b8fcfc4a274a53db4b25161ecdb3fe5 --- /dev/null +++ b/paddle/fluid/operators/box_coder_op.h @@ -0,0 +1,151 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +enum class BoxCodeType { kEncodeCenterSize = 0, kDecodeCenterSize = 1 }; + +inline BoxCodeType GetBoxCodeType(const std::string& type) { + if (type == "encode_center_size") { + return BoxCodeType::kEncodeCenterSize; + } else if (type == "decode_center_size") { + return BoxCodeType::kDecodeCenterSize; + } + PADDLE_THROW("Not support type %s.", type); +} + +template +class BoxCoderKernel : public framework::OpKernel { + public: + void EncodeCenterSize(const framework::Tensor& target_box, + const framework::Tensor& prior_box, + const framework::Tensor& prior_box_var, + T* output) const { + int64_t row = target_box.dims()[0]; + int64_t col = prior_box.dims()[0]; + int64_t len = prior_box.dims()[1]; + auto* target_box_data = target_box.data(); + auto* prior_box_data = prior_box.data(); + auto* prior_box_var_data = prior_box_var.data(); + + for (int64_t i = 0; i < row; ++i) { + for (int64_t j = 0; j < col; ++j) { + T prior_box_width = + prior_box_data[j * len + 2] - prior_box_data[j * len]; + T prior_box_height = + prior_box_data[j * len + 3] - prior_box_data[j * len + 1]; + T prior_box_center_x = + (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2; + T prior_box_center_y = + (prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2; + + T target_box_center_x = + (target_box_data[i * len + 2] + target_box_data[i * len]) / 2; + T target_box_center_y = + (target_box_data[i * len + 3] + target_box_data[i * len + 1]) / 2; + T target_box_width = + target_box_data[i * len + 2] - target_box_data[i * len]; + T target_box_height = + target_box_data[i * len + 3] - target_box_data[i * len + 1]; + + size_t offset = i * col * len + j * len; + output[offset] = (target_box_center_x - prior_box_center_x) / + prior_box_width / prior_box_var_data[j * len]; + output[offset + 1] = (target_box_center_y - prior_box_center_y) / + prior_box_height / prior_box_var_data[j * len + 1]; + output[offset + 2] = + std::log(std::fabs(target_box_width / prior_box_width)) / + prior_box_var_data[j * len + 2]; + output[offset + 3] = + std::log(std::fabs(target_box_height / prior_box_height)) / + prior_box_var_data[j * len + 3]; + } + } + } + void DecodeCenterSize(const framework::Tensor& target_box, + const framework::Tensor& prior_box, + const framework::Tensor& prior_box_var, + T* output) const { + int64_t row = target_box.dims()[0]; + int64_t col = prior_box.dims()[0]; + int64_t len = prior_box.dims()[1]; + + auto* target_box_data = target_box.data(); + auto* prior_box_data = prior_box.data(); + auto* prior_box_var_data = prior_box_var.data(); + + for (int64_t i = 0; i < row; ++i) { + for (int64_t j = 0; j < col; ++j) { + T prior_box_width = + prior_box_data[j * len + 2] - prior_box_data[j * len]; + T prior_box_height = + prior_box_data[j * len + 3] - prior_box_data[j * len + 1]; + T prior_box_center_x = + (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2; + T prior_box_center_y = + (prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2; + + T target_box_center_x = prior_box_var_data[j * len] * + target_box_data[i * len] * prior_box_width + + prior_box_center_x; + T target_box_center_y = prior_box_var_data[j * len + 1] * + target_box_data[i * len + 1] * + prior_box_height + + prior_box_center_y; + T target_box_width = std::exp(prior_box_var_data[j * len + 2] * + target_box_data[i * len + 2]) * + prior_box_width; + T 
target_box_height = std::exp(prior_box_var_data[j * len + 3] * + target_box_data[i * len + 3]) * + prior_box_height; + + size_t offset = i * col * len + j * len; + output[offset] = target_box_center_x - target_box_width / 2; + output[offset + 1] = target_box_center_y - target_box_height / 2; + output[offset + 2] = target_box_center_x + target_box_width / 2; + output[offset + 3] = target_box_center_y + target_box_height / 2; + } + } + } + + void Compute(const framework::ExecutionContext& context) const override { + auto* prior_box = context.Input("PriorBox"); + auto* prior_box_var = context.Input("PriorBoxVar"); + auto* target_box = context.Input("TargetBox"); + auto* output_box = context.Output("OutputBox"); + + if (target_box->lod().size()) { + PADDLE_ENFORCE_EQ(target_box->lod().size(), 1UL, + "Only support 1 level of LoD."); + } + auto row = target_box->dims()[0]; + auto col = prior_box->dims()[0]; + auto len = prior_box->dims()[1]; + + output_box->mutable_data({row, col, len}, context.GetPlace()); + + auto code_type = GetBoxCodeType(context.Attr("code_type")); + T* output = output_box->data(); + if (code_type == BoxCodeType::kEncodeCenterSize) { + EncodeCenterSize(*target_box, *prior_box, *prior_box_var, output); + } else if (code_type == BoxCodeType::kDecodeCenterSize) { + DecodeCenterSize(*target_box, *prior_box, *prior_box_var, output); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/cast_op.cc b/paddle/fluid/operators/cast_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..364c21f7619910784d63047f3abb3713f1bfd0fc --- /dev/null +++ b/paddle/fluid/operators/cast_op.cc @@ -0,0 +1,77 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/cast_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class CastOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + CastOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input tensor of cast op"); + AddOutput("Out", "The output tensor of cast op"); + AddAttr("out_dtype", "output data type"); + AddAttr("in_dtype", "input data type"); + AddComment(R"DOC( +Cast Operator. + +This Operator casts the input tensor to another data type and +returns tha Output Tensor. 
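+
+The operator also registers a gradient: the gradient op is another cast that
+converts the gradient of Out back from out_dtype to in_dtype (see
+CastOpGradMaker below).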
+ +)DOC"); + } +}; + +class CastOpInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInput("X"), "The input of cast op must be set"); + PADDLE_ENFORCE(context->HasOutput("Out"), + "The output of cast op must be set"); + context->SetOutputDim("Out", context->GetInputDim("X")); + context->ShareLoD("X", "Out"); + } +}; + +class CastOpGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto grad = new framework::OpDesc(); + grad->SetType("cast"); + grad->SetInput("X", OutputGrad("Out")); + grad->SetOutput("Out", InputGrad("X")); + grad->SetAttr("out_dtype", GetAttr("in_dtype")); + grad->SetAttr("in_dtype", GetAttr("out_dtype")); + return std::unique_ptr(grad); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +using CPU = paddle::platform::CPUDeviceContext; +REGISTER_OP_WITH_KERNEL(cast, ops::CastOpGradMaker, ops::CastOpInferShape, + ops::CastOpProtoMaker); +REGISTER_OP_CPU_KERNEL(cast, ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel); diff --git a/paddle/fluid/operators/cast_op.cu b/paddle/fluid/operators/cast_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..fb597be9d93af12afb608bf87382c283ddf78e7c --- /dev/null +++ b/paddle/fluid/operators/cast_op.cu @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/cast_op.h" + +template +using CastOpKernel = + paddle::operators::CastOpKernel; + +REGISTER_OP_CUDA_KERNEL(cast, CastOpKernel, CastOpKernel, + CastOpKernel, CastOpKernel, + CastOpKernel); diff --git a/paddle/fluid/operators/cast_op.h b/paddle/fluid/operators/cast_op.h new file mode 100644 index 0000000000000000000000000000000000000000..9ab4961cef4bd6e7d4e592581b51d7d4eb896ec7 --- /dev/null +++ b/paddle/fluid/operators/cast_op.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/transform.h" + +namespace paddle { +namespace operators { + +template +struct CastOpTransformFunctor { + HOSTDEVICE OutT operator()(InT in) const { return static_cast(in); } +}; + +template +struct CastOpFunctor { + const framework::Tensor* in_; + framework::Tensor* out_; + const DeviceContext& ctx_; + CastOpFunctor(const framework::Tensor* in, framework::Tensor* out, + const DeviceContext& ctx) + : in_(in), out_(out), ctx_(ctx) {} + + template + void operator()() const { + auto* in_begin = in_->data(); + auto numel = in_->numel(); + auto* in_end = in_begin + numel; + auto* out_begin = out_->mutable_data(ctx_.GetPlace()); + platform::Transform trans; + trans(ctx_, in_begin, in_end, out_begin, + CastOpTransformFunctor()); + } +}; + +template +class CastOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + auto* out = context.Output("Out"); + framework::VisitDataType( + static_cast(context.Attr("out_dtype")), + CastOpFunctor( + in, out, context.template device_context())); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/chunk_eval_op.cc b/paddle/fluid/operators/chunk_eval_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..080e4d80da4752a0c6bea86c0a9f503cf46e8878 --- /dev/null +++ b/paddle/fluid/operators/chunk_eval_op.cc @@ -0,0 +1,165 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/chunk_eval_op.h" + +namespace paddle { +namespace operators { + +class ChunkEvalOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Inference"), + "Input(Inference) of ChunkEvalOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), + "Input(Label) of ChunkEvalOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Precision"), + "Output(Precision) of ChunkEvalOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Recall"), + "Output(Recall) of ChunkEvalOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("F1-Score"), + "Output(F1-Score) of ChunkEvalOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("NumInferChunks"), + "Output(NumInferChunks) of ChunkEvalOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("NumLabelChunks"), + "Output(NumLabelChunks) of ChunkEvalOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("NumCorrectChunks"), + "Output(NumCorrectChunks) of ChunkEvalOp should not be null."); + + auto inference_dim = ctx->GetInputDim("Inference"); + auto label_dim = ctx->GetInputDim("Label"); + + PADDLE_ENFORCE(inference_dim == label_dim, + "Inference's shape must be the same as Label's shape."); + + ctx->SetOutputDim("Precision", {1}); + ctx->SetOutputDim("Recall", {1}); + ctx->SetOutputDim("F1-Score", {1}); + ctx->SetOutputDim("NumInferChunks", {1}); + ctx->SetOutputDim("NumLabelChunks", {1}); + ctx->SetOutputDim("NumCorrectChunks", {1}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType(framework::proto::DataType::FP32, + platform::CPUPlace()); + } +}; + +class ChunkEvalOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ChunkEvalOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Inference", + "(Tensor, default: Tensor). " + "Predictions from the network."); + AddInput("Label", + "(Tensor, default: Tensor). The true tag sequences."); + AddOutput("Precision", + "(float). The evaluated precision (called positive predictive " + "value) of chunks on the given mini-batch."); + AddOutput("Recall", + "(float). The evaluated recall (true positive rate or " + "sensitivity) of chunks on the given mini-batch."); + AddOutput("F1-Score", + "(float). The evaluated F1-Score on the given mini-batch."); + AddOutput("NumInferChunks", + "(int64_t). The number of chunks in Inference on the given " + "mini-batch."); + AddOutput( + "NumLabelChunks", + "(int64_t). The number of chunks in Label on the given mini-batch."); + AddOutput( + "NumCorrectChunks", + "(int64_t). The number of chunks both in Inference and Label on the " + "given mini-batch."); + AddAttr("num_chunk_types", + "(int). The number of chunk type. See below for details."); + AddAttr( + "chunk_scheme", + "(string, default IOB). The labeling scheme indicating " + "how to encode the chunks. Must be IOB, IOE, IOBES or plain. See below " + "for details.") + .SetDefault("IOB"); + AddAttr>("excluded_chunk_types", + "(list) A list including chunk type ids " + "indicating chunk types that are not counted. " + "See below for details.") + .SetDefault(std::vector{}); + AddComment(R"DOC( +For some basics of chunking, please refer to +‘Chunking with Support Vector Machines ’. 
+ + +CheckEvalOp computes the precision, recall, and F1-score of chunk detection, +and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes. +Here is a NER example of labeling for these tagging schemes: + + Li Ming works at Agricultural Bank of China in Beijing. + IO: I-PER I-PER O O I-ORG I-ORG I-ORG I-ORG O I-LOC + IOB: B-PER I-PER O O B-ORG I-ORG I-ORG I-ORG O B-LOC + IOE: I-PER E-PER O O I-ORG I-ORG I-ORG E-ORG O E-LOC + IOBES: B-PER E-PER O O I-ORG I-ORG I-ORG E-ORG O S-LOC + +There are three chunk types(named entity types) including PER(person), ORG(organization) +and LOC(LOCATION), and we can see that the labels have the form -. + +Since the calculations actually use label ids rather than labels, extra attention +should be paid when mapping labels to ids to make CheckEvalOp work. The key point +is that the listed equations are satisfied by ids. + + tag_type = label % num_tag_type + chunk_type = label / num_tag_type + +where `num_tag_type` is the num of tag types in the tagging scheme, `num_chunk_type` +is the num of chunk types, and `tag_type` get its value from the following table. + + Scheme Begin Inside End Single + plain 0 - - - + IOB 0 1 - - + IOE - 0 1 - + IOBES 0 1 2 3 + +Still use NER as example, assuming the tagging scheme is IOB while chunk types are ORG, +PER and LOC. To satisfy the above equations, the label map can be like this: + + B-ORG 0 + I-ORG 1 + B-PER 2 + I-PER 3 + B-LOC 4 + I-LOC 5 + O 6 + +It’s not hard to verify the equations noting that the num of chunk types +is 3 and the num of tag types in IOB scheme is 2. For example, the label +id of I-LOC is 5, the tag type id of I-LOC is 1, and the chunk type id of +I-LOC is 2, which consistent with the results from the equations. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(chunk_eval, ops::ChunkEvalOp, + ops::ChunkEvalOpMaker); +REGISTER_OP_CPU_KERNEL(chunk_eval, + ops::ChunkEvalKernel); diff --git a/paddle/fluid/operators/chunk_eval_op.h b/paddle/fluid/operators/chunk_eval_op.h new file mode 100644 index 0000000000000000000000000000000000000000..3dca3d2c0f99c3b2d447ffe4516c1b6c379b13f2 --- /dev/null +++ b/paddle/fluid/operators/chunk_eval_op.h @@ -0,0 +1,236 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +class ChunkEvalKernel : public framework::OpKernel { + public: + struct Segment { + int begin; + int end; + int type; + bool operator==(const Segment& y) const { + return begin == y.begin && end == y.end && type == y.type; + } + }; + + void GetSegments(const int64_t* label, int length, + std::vector& segments, int num_chunk_types, + int num_tag_types, int other_chunk_type, int tag_begin, + int tag_inside, int tag_end, int tag_single) const { + segments.clear(); + segments.reserve(length); + int chunk_start = 0; + bool in_chunk = false; + int tag = -1; + int type = other_chunk_type; + for (int i = 0; i < length; ++i) { + int prev_tag = tag; + int prev_type = type; + PADDLE_ENFORCE_LE(label[i], num_chunk_types * num_tag_types); + tag = label[i] % num_tag_types; + type = label[i] / num_tag_types; + if (in_chunk && ChunkEnd(prev_tag, prev_type, tag, type, other_chunk_type, + tag_begin, tag_inside, tag_end, tag_single)) { + Segment segment{ + chunk_start, // begin + i - 1, // end + prev_type, + }; + segments.push_back(segment); + in_chunk = false; + } + if (ChunkBegin(prev_tag, prev_type, tag, type, other_chunk_type, + tag_begin, tag_inside, tag_end, tag_single)) { + chunk_start = i; + in_chunk = true; + } + } + if (in_chunk) { + Segment segment{ + chunk_start, // begin + length - 1, // end + type, + }; + segments.push_back(segment); + } + } + + bool ChunkEnd(int prev_tag, int prev_type, int tag, int type, + int other_chunk_type, int tag_begin, int tag_inside, + int tag_end, int tag_single) const { + if (prev_type == other_chunk_type) return false; + if (type == other_chunk_type) return true; + if (type != prev_type) return true; + if (prev_tag == tag_begin) return tag == tag_begin || tag == tag_single; + if (prev_tag == tag_inside) return tag == tag_begin || tag == tag_single; + if (prev_tag == tag_end) return true; + if (prev_tag == tag_single) return true; + return false; + } + + bool ChunkBegin(int prev_tag, int prev_type, int tag, int type, + int other_chunk_type, int tag_begin, int tag_inside, + int tag_end, int tag_single) const { + if (prev_type == other_chunk_type) return type != other_chunk_type; + if (type == other_chunk_type) return false; + if (type != prev_type) return true; + if (tag == tag_begin) return true; + if (tag == tag_inside) return prev_tag == tag_end || prev_tag == tag_single; + if (tag == tag_end) return prev_tag == tag_end || prev_tag == tag_single; + if (tag == tag_single) return true; + return false; + } + + void Compute(const framework::ExecutionContext& context) const override { + // initialize to parse configurations + int num_chunk_types, num_tag_types; + int other_chunk_type; + int tag_begin, tag_inside, tag_end, tag_single; + std::vector label_segments; + std::vector output_segments; + std::set excluded_chunk_types; + + if (context.Attr("chunk_scheme") == "IOB") { + num_tag_types = 2; + tag_begin = 0; + tag_inside = 1; + tag_end = -1; + tag_single = -1; + } else if (context.Attr("chunk_scheme") == "IOE") { + num_tag_types = 2; + tag_begin = -1; + tag_inside = 0; + tag_end = 1; + tag_single = -1; + } else if (context.Attr("chunk_scheme") == "IOBES") { + num_tag_types = 4; + tag_begin = 0; + tag_inside = 1; + tag_end = 2; + tag_single = 3; + } else if (context.Attr("chunk_scheme") == "plain") { + 
num_tag_types = 1; + tag_begin = -1; + tag_inside = -1; + tag_end = -1; + tag_single = -1; + } else { + PADDLE_THROW("Unknown chunk scheme."); + } + other_chunk_type = num_chunk_types = context.Attr("num_chunk_types"); + excluded_chunk_types.insert( + context.Attr>("excluded_chunk_types").begin(), + context.Attr>("excluded_chunk_types").end()); + + auto* inference = context.Input("Inference"); + auto place = inference->place(); + auto* label = context.Input("Label"); + auto* precision = context.Output("Precision"); + auto* recall = context.Output("Recall"); + auto* f1 = context.Output("F1-Score"); + auto* num_infer_chunks = context.Output("NumInferChunks"); + auto* num_label_chunks = context.Output("NumLabelChunks"); + auto* num_correct_chunks = context.Output("NumCorrectChunks"); + + const int64_t* inference_data = inference->data(); + const int64_t* label_data = label->data(); + T* precision_data = precision->mutable_data(place); + T* racall_data = recall->mutable_data(place); + T* f1_data = f1->mutable_data(place); + int64_t* num_infer_chunks_data = + num_infer_chunks->mutable_data(place); + int64_t* num_label_chunks_data = + num_label_chunks->mutable_data(place); + int64_t* num_correct_chunks_data = + num_correct_chunks->mutable_data(place); + *num_infer_chunks_data = 0; + *num_label_chunks_data = 0; + *num_correct_chunks_data = 0; + + auto lod = label->lod(); + PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now."); + PADDLE_ENFORCE(lod == inference->lod(), + "LoD must be same between Inference and Label."); + int num_sequences = lod[0].size() - 1; + for (int i = 0; i < num_sequences; ++i) { + int seq_length = lod[0][i + 1] - lod[0][i]; + EvalOneSeq(inference_data + lod[0][i], label_data + lod[0][i], seq_length, + output_segments, label_segments, *num_infer_chunks_data, + *num_label_chunks_data, *num_correct_chunks_data, + num_chunk_types, num_tag_types, other_chunk_type, tag_begin, + tag_inside, tag_end, tag_single, excluded_chunk_types); + } + *precision_data = !(*num_infer_chunks_data) + ? 0 + : static_cast(*num_correct_chunks_data) / + (*num_infer_chunks_data); + *racall_data = !(*num_label_chunks_data) + ? 0 + : static_cast(*num_correct_chunks_data) / + (*num_label_chunks_data); + *f1_data = !(*num_correct_chunks_data) + ? 
0 + : 2 * (*precision_data) * (*racall_data) / + ((*precision_data) + (*racall_data)); + } + + void EvalOneSeq(const int64_t* output, const int64_t* label, int length, + std::vector& output_segments, + std::vector& label_segments, + int64_t& num_output_segments, int64_t& num_label_segments, + int64_t& num_correct, int num_chunk_types, int num_tag_types, + int other_chunk_type, int tag_begin, int tag_inside, + int tag_end, int tag_single, + const std::set& excluded_chunk_types) const { + GetSegments(output, length, output_segments, num_chunk_types, num_tag_types, + other_chunk_type, tag_begin, tag_inside, tag_end, tag_single); + GetSegments(label, length, label_segments, num_chunk_types, num_tag_types, + other_chunk_type, tag_begin, tag_inside, tag_end, tag_single); + size_t i = 0, j = 0; + while (i < output_segments.size() && j < label_segments.size()) { + if (output_segments[i] == label_segments[j] && + excluded_chunk_types.count(output_segments[i].type) != 1) { + ++num_correct; + } + if (output_segments[i].end < label_segments[j].end) { + ++i; + } else if (output_segments[i].end > label_segments[j].end) { + ++j; + } else { + ++i; + ++j; + } + } + for (auto& segment : label_segments) { + if (excluded_chunk_types.count(segment.type) != 1) ++num_label_segments; + } + for (auto& segment : output_segments) { + if (excluded_chunk_types.count(segment.type) != 1) ++num_output_segments; + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/clip_by_norm_op.cc b/paddle/fluid/operators/clip_by_norm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..89df118c06f4df6444fc1f61b5ddde48f6ad8ba7 --- /dev/null +++ b/paddle/fluid/operators/clip_by_norm_op.cc @@ -0,0 +1,74 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/clip_by_norm_op.h" + +namespace paddle { +namespace operators { + +class ClipByNormOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ClipByNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ClipByNormOp should not be null."); + auto max_norm = ctx->Attrs().Get("max_norm"); + PADDLE_ENFORCE_GT(max_norm, 0, "max_norm should be greater than 0."); + auto x_dims = ctx->GetInputDim("X"); + ctx->SetOutputDim("Out", x_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class ClipByNormOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ClipByNormOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(Tensor) The input of clip_by_norm op." 
+ "The number of dimensions must be between [1, 9]."); + AddOutput("Out", + "(Tensor) The output of clip_by_norm op with shape as input(X)"); + AddAttr("max_norm", "(float) The maximum norm value."); + AddComment(R"DOC( +ClipByNorm Operator. + +This operator limits the L2 norm of the input $X$ within $max\_norm$. +If the L2 norm of $X$ is less than or equal to $max\_norm$, $Out$ will be +the same as $X$. If the L2 norm of $X$ is greater than $max\_norm$, $X$ will +be linearly scaled to make the L2 norm of $Out$ equal to $max\_norm$, as +shown in the following formula: + +$$ +Out = \frac{max\_norm * X}{norm(X)}, +$$ + +where $norm(X)$ represents the L2 norm of $X$. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(clip_by_norm, ops::ClipByNormOp, + ops::ClipByNormOpMaker); +REGISTER_OP_CPU_KERNEL( + clip_by_norm, + ops::ClipByNormKernel); diff --git a/paddle/fluid/operators/clip_by_norm_op.cu b/paddle/fluid/operators/clip_by_norm_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..a466b335914f1fde6865c6cc375f4ef009632e41 --- /dev/null +++ b/paddle/fluid/operators/clip_by_norm_op.cu @@ -0,0 +1,20 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/clip_by_norm_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + clip_by_norm, + ops::ClipByNormKernel); diff --git a/paddle/fluid/operators/clip_by_norm_op.h b/paddle/fluid/operators/clip_by_norm_op.h new file mode 100644 index 0000000000000000000000000000000000000000..82bcf07657bfbf1df6b541b3953285622fb25a87 --- /dev/null +++ b/paddle/fluid/operators/clip_by_norm_op.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
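As an illustrative aside (standalone C++, not part of the patch), the rescaling rule stated in the ClipByNorm comment amounts to: keep X unchanged while its L2 norm is within max_norm, otherwise scale it so the norm becomes exactly max_norm. A minimal sketch of that rule:

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Clip a vector so that its L2 norm does not exceed max_norm.
    std::vector<float> ClipByNorm(std::vector<float> x, float max_norm) {
      float sum_sq = 0.f;
      for (float v : x) sum_sq += v * v;
      const float norm = std::sqrt(sum_sq);
      if (norm > max_norm) {
        const float scale = max_norm / norm;  // Out = max_norm * X / norm(X)
        for (float& v : x) v *= scale;
      }
      return x;
    }

    int main() {
      auto out = ClipByNorm({3.f, 4.f}, 1.f);  // norm(X) = 5 > 1, so scale by 1/5
      std::printf("%g %g\n", out[0], out[1]);  // 0.6 0.8
      return 0;
    }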
*/ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/transform.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; + +template +class ClipByNormKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto max_norm = context.Attr("max_norm"); + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + output->mutable_data(context.GetPlace()); + + auto x = EigenVector::Flatten(*input); + auto out = EigenVector::Flatten(*output); + auto x_norm = x.square().sum().sqrt(); + auto& place = + *context.template device_context().eigen_device(); + + auto temp = (x_norm <= max_norm).template cast().eval(); + auto scaling = temp + (static_cast(1) - temp) * max_norm / x_norm; + Eigen::array one_dim{{1}}; + Eigen::DSizes m_dsize(input->numel()); + out.device(place) = x * scaling.reshape(one_dim).broadcast(m_dsize); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/clip_op.cc b/paddle/fluid/operators/clip_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..76b2cefbf9dd67c5036b6ecfe35a9d53a54467a9 --- /dev/null +++ b/paddle/fluid/operators/clip_op.cc @@ -0,0 +1,89 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/clip_op.h" + +namespace paddle { +namespace operators { + +class ClipOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ClipOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ClipOp should not be null."); + auto x_dims = ctx->GetInputDim("X"); + auto max = ctx->Attrs().Get("max"); + auto min = ctx->Attrs().Get("min"); + PADDLE_ENFORCE_LT(min, max, "max should be greater than min."); + ctx->SetOutputDim("Out", x_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +template +class ClipOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ClipOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(Tensor)The input of clip op." + "The number of dimensions must be between [1, 9]."); + AddOutput("Out", "(Tensor)The output of clip op with shape as input(X)"); + AddAttr( + "min", "(float)Minimum value, under which element is replaced by min."); + AddAttr( + "max", "(float)Maximum value, above which element is replaced by max"); + AddComment(R"DOC( +Clip Operator. + +The clip operator limits the value of given input within an interval. 
The +interval is specified with arguments 'min' and 'max': + +$$ +Out = \min(\max(X, min), max) +$$ + +)DOC"); + } +}; + +class ClipOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + auto x_dims = ctx->GetInputDim("X"); + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(clip, ops::ClipOp, ops::ClipOpMaker, clip_grad, + ops::ClipOpGrad); +REGISTER_OP_CPU_KERNEL( + clip, ops::ClipKernel); +REGISTER_OP_CPU_KERNEL( + clip_grad, ops::ClipGradKernel); diff --git a/paddle/fluid/operators/clip_op.cu b/paddle/fluid/operators/clip_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..7b044d6e699d59dda04fa19468c17faf1e0a0eb7 --- /dev/null +++ b/paddle/fluid/operators/clip_op.cu @@ -0,0 +1,21 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/clip_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + clip, ops::ClipKernel); +REGISTER_OP_CUDA_KERNEL( + clip_grad, ops::ClipGradKernel); diff --git a/paddle/fluid/operators/clip_op.h b/paddle/fluid/operators/clip_op.h new file mode 100644 index 0000000000000000000000000000000000000000..aecd6f83bfaf4deab4271e859bea10feecacab62 --- /dev/null +++ b/paddle/fluid/operators/clip_op.h @@ -0,0 +1,97 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/transform.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; +using platform::Transform; + +template +class ClipFunctor { + public: + explicit ClipFunctor(const T min, const T max) : min_(min), max_(max) {} + HOSTDEVICE T operator()(const T& x) const { + if (x < min_) + return min_; + else if (x > max_) + return max_; + else + return x; + } + + private: + T min_; + T max_; +}; + +template +class ClipGradFunctor { + public: + explicit ClipGradFunctor(const T min, const T max) : min_(min), max_(max) {} + HOSTDEVICE T operator()(const T& x, const T& y) const { + return (y > min_ && y < max_) ? x : 0; + } + + private: + T min_; + T max_; +}; + +template +class ClipKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto max = context.Attr("max"); + auto min = context.Attr("min"); + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + T* out_data = out->mutable_data(context.GetPlace()); + const T* x_data = x->data(); + int64_t numel = x->numel(); + Transform trans; + trans(context.template device_context(), x_data, + x_data + numel, out_data, ClipFunctor(min, max)); + } +}; + +template +class ClipGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto max = context.Attr("max"); + auto min = context.Attr("min"); + auto* d_out = context.Input(framework::GradVarName("Out")); + auto* d_x = context.Output(framework::GradVarName("X")); + if (d_x != nullptr) { + auto* x = context.Input("X"); + int64_t numel = d_out->numel(); + auto* d_x_data = d_x->mutable_data(context.GetPlace()); + const T* d_out_data = d_out->data(); + const T* x_data = x->data(); + Transform trans; + trans(context.template device_context(), d_out_data, + d_out_data + numel, x_data, d_x_data, ClipGradFunctor(min, max)); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/compare_op.cc b/paddle/fluid/operators/compare_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f3414c33b5ab3cc8dffee640fd85b9625b3f237b --- /dev/null +++ b/paddle/fluid/operators/compare_op.cc @@ -0,0 +1,104 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
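As an illustrative aside (standalone C++, not part of the patch), ClipFunctor and ClipGradFunctor above implement the formula from the ClipOp comment and its gradient: the forward pass clamps each element into [min, max], and the backward pass propagates the upstream gradient only where the original input lies strictly inside the interval. A sketch of the same element-wise rules, with low/high standing in for the min/max attributes:

    #include <cstdio>

    // Forward rule: Out = min(max(X, low), high).
    float Clip(float x, float low, float high) {
      if (x < low) return low;
      if (x > high) return high;
      return x;
    }

    // Backward rule: dX = dOut where low < X < high, otherwise 0.
    float ClipGrad(float dout, float x, float low, float high) {
      return (x > low && x < high) ? dout : 0.f;
    }

    int main() {
      // X = 2.5 is clipped to 2.0, so its gradient is blocked.
      std::printf("%g %g\n", Clip(2.5f, -1.f, 2.f), ClipGrad(1.f, 2.5f, -1.f, 2.f));
      // X = 0.5 lies inside the interval, so the gradient passes through.
      std::printf("%g %g\n", Clip(0.5f, -1.f, 2.f), ClipGrad(1.f, 0.5f, -1.f, 2.f));
      return 0;
    }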
*/ + +#include "paddle/fluid/operators/compare_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { +template +class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + CompareOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + OpComment comment; + AddInput("X", + string::Sprintf("(LoDTensor) the left hand operand of %s operator", + comment.type)); + AddInput("Y", string::Sprintf( + "(LoDTensor) the right hand operand of %s operator", + comment.type)); + AddOutput("Out", string::Sprintf( + "(LoDTensor) n-dim bool tensor. Each element is %s", + comment.equation)); + AddComment(string::Sprintf(R"DOC(%s Operator + +It operates element-wise on X and Y, and returns the Out. Each of them is a +N-dim tensor. X and Y could be any type. The each element of the Out tensor is +calculated by %s +)DOC", + comment.type, comment.equation)); + AddAttr("axis", + "(int, default -1). The start dimension index " + "for broadcasting Y onto X.") + .SetDefault(-1) + .EqualGreaterThan(-1); + } +}; + +template +class CompareOpInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + OpComment comment; + PADDLE_ENFORCE(context->HasInput("X"), "%s operator must has input X", + comment.type); + PADDLE_ENFORCE(context->HasInput("Y"), "%s operator must has input Y", + comment.type); + auto dim_x = context->GetInputDim("X"); + auto dim_y = context->GetInputDim("Y"); + PADDLE_ENFORCE_GE(dim_x.size(), dim_y.size(), + "The size of dim_y should not be greater than dim_x's."); + + context->SetOutputDim("Out", context->GetInputDim("X")); + context->ShareLoD("X", "Out"); + } +}; + +class CompareOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx); + // CompareOp kernel's device type is decided by input tensor place + kt.place_ = ctx.Input("X")->place(); + return kt; + } +}; + +} // namespace operators +} // namespace paddle + +#define REGISTER_LOGICAL_OP(op_type, _equation) \ + struct _##op_type##Comment { \ + static char type[]; \ + static char equation[]; \ + }; \ + char _##op_type##Comment::type[]{#op_type}; \ + char _##op_type##Comment::equation[]{_equation}; \ + REGISTER_OPERATOR( \ + op_type, ::paddle::operators::CompareOp, \ + ::paddle::operators::CompareOpProtoMaker<_##op_type##Comment>, \ + ::paddle::operators::CompareOpInferShape<_##op_type##Comment>, \ + ::paddle::framework::EmptyGradOpMaker); + +REGISTER_LOGICAL_OP(less_than, "Out = X < Y"); +REGISTER_LOGICAL_KERNEL(less_than, CPU, paddle::operators::LessThanFunctor); +REGISTER_LOGICAL_OP(less_equal, "Out = X <= Y"); +REGISTER_LOGICAL_KERNEL(less_equal, CPU, paddle::operators::LessEqualFunctor); +REGISTER_LOGICAL_OP(equal, "Out = X == Y"); +REGISTER_LOGICAL_KERNEL(equal, CPU, paddle::operators::EqualFunctor); diff --git a/paddle/fluid/operators/compare_op.cu b/paddle/fluid/operators/compare_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..3507af2ae3add8cf02f5b9f3b3d89b40d73bcb0d --- /dev/null +++ b/paddle/fluid/operators/compare_op.cu @@ -0,0 +1,19 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/compare_op.h" + +REGISTER_LOGICAL_KERNEL(less_than, CUDA, paddle::operators::LessThanFunctor); +REGISTER_LOGICAL_KERNEL(less_equal, CUDA, paddle::operators::LessEqualFunctor); +REGISTER_LOGICAL_KERNEL(equal, CUDA, paddle::operators::EqualFunctor); diff --git a/paddle/fluid/operators/compare_op.h b/paddle/fluid/operators/compare_op.h new file mode 100644 index 0000000000000000000000000000000000000000..4b2ee5a9d68f5f1fd3d2d374669763855659f1db --- /dev/null +++ b/paddle/fluid/operators/compare_op.h @@ -0,0 +1,81 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/elementwise_op_function.h" +#include "paddle/fluid/platform/transform.h" + +namespace paddle { +namespace operators { + +template +struct LessThanFunctor { + using ELEM_TYPE = T; + HOSTDEVICE bool operator()(const T& a, const T& b) const { return a < b; } +}; + +template +struct LessEqualFunctor { + using ELEM_TYPE = T; + HOSTDEVICE bool operator()(const T& a, const T& b) const { return a <= b; } +}; + +template +struct EqualFunctor { + using ELEM_TYPE = T; + HOSTDEVICE bool operator()(const T& a, const T& b) const { + if (std::is_floating_point::value) { + // This branch will be optimized while compiling if T is integer. It is + // safe to cast a and b to double. 
+ return fabs(static_cast(a - b)) < 1e-8; + } else { + return (a == b); + } + } +}; + +template +class CompareOpKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + using T = typename Functor::ELEM_TYPE; + using Tensor = framework::Tensor; + + auto* x = context.Input("X"); + auto* y = context.Input("Y"); + auto* z = context.Output("Out"); + z->mutable_data(context.GetPlace()); + int axis = context.Attr("axis"); + ElementwiseComputeEx(context, x, y, axis, + Functor(), z); + } +}; + +} // namespace operators +} // namespace paddle + +#define REGISTER_LOGICAL_KERNEL(op_type, dev, functor) \ + REGISTER_OP_##dev##_KERNEL( \ + op_type, ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>); diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..68eb5412beb02d9dc948eeb188a2d5b1cdb0c5b3 --- /dev/null +++ b/paddle/fluid/operators/concat_op.cc @@ -0,0 +1,106 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
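As an illustrative aside (standalone C++, not part of the patch), the floating-point branch of EqualFunctor above avoids exact == and instead checks |a - b| < 1e-8, so values that differ only by rounding error still compare equal. A small demonstration of why that matters:

    #include <cmath>
    #include <cstdio>

    // Tolerance-based equality in the spirit of EqualFunctor's float branch.
    bool AlmostEqual(double a, double b) { return std::fabs(a - b) < 1e-8; }

    int main() {
      double a = 0.1 + 0.2;  // 0.30000000000000004 after binary rounding
      double b = 0.3;
      std::printf("exact ==   : %d\n", a == b ? 1 : 0);    // 0
      std::printf("AlmostEqual: %d\n", AlmostEqual(a, b) ? 1 : 0);  // 1
      return 0;
    }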
*/ + +#include "paddle/fluid/operators/concat_op.h" +#include + +namespace paddle { +namespace operators { +using framework::Tensor; + +class ConcatOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE_GE(ctx->Inputs("X").size(), 1UL, + "Inputs(X) of ConcatOp should be empty."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ConcatOp should not be null."); + + auto ins = ctx->GetInputsDim("X"); + size_t axis = static_cast(ctx->Attrs().Get("axis")); + const size_t n = ins.size(); + + PADDLE_ENFORCE_GT(n, 1, "Input tensors count should > 1."); + + auto out_dims = ins[0]; + size_t in_zero_dims_size = out_dims.size(); + for (size_t i = 1; i < n; i++) { + for (size_t j = 0; j < in_zero_dims_size; j++) { + if (j == axis) { + out_dims[axis] += ins[i][j]; + } else { + PADDLE_ENFORCE_EQ(out_dims[j], ins[i][j], + "Input tensors should have the same " + "elements except the specify axis."); + } + } + } + if (out_dims[axis] < 0) { + out_dims[axis] = -1; + } + ctx->SetOutputDim("Out", out_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class ConcatOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ConcatOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input tensors of concat operator.").AsDuplicable(); + AddOutput("Out", "Output tensor of concat operator."); + AddAttr("axis", + "The axis along which the input tensors will be concatenated.") + .SetDefault(0); + AddComment(R"DOC( +Concat Operator. + +Concatenate the input tensors along dimension axis. +Examples: + Input[0] = [[1,2],[3,4]] + Input[1] = [[5,6]] + axis = 0 + Output = [[1,2], + [3,4], + [5,6]] + +)DOC"); + } +}; + +class ConcatOpGrad : public framework::OperatorWithKernel { + public: + ConcatOpGrad(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_EX(concat, ops::ConcatOp, ops::ConcatOpMaker, concat_grad, + ops::ConcatOpGrad, false) +REGISTER_OP_CPU_KERNEL(concat, + ops::ConcatKernel) +REGISTER_OP_CPU_KERNEL(concat_grad, + ops::ConcatGradKernel) diff --git a/paddle/fluid/operators/concat_op.cu.cc b/paddle/fluid/operators/concat_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..143bda6116775611e399ad805708474621d33b96 --- /dev/null +++ b/paddle/fluid/operators/concat_op.cu.cc @@ -0,0 +1,21 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/concat_op.h" +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + concat, ops::ConcatKernel); +REGISTER_OP_CUDA_KERNEL( + concat_grad, + ops::ConcatGradKernel); diff --git a/paddle/fluid/operators/concat_op.h b/paddle/fluid/operators/concat_op.h new file mode 100644 index 0000000000000000000000000000000000000000..72b3e225bf64f889804eb5e4fab9df4653f5452b --- /dev/null +++ b/paddle/fluid/operators/concat_op.h @@ -0,0 +1,69 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/strided_memcpy.h" + +namespace paddle { +namespace operators { + +template +class ConcatKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto ins = ctx.MultiInput("X"); + auto* out = ctx.Output("Out"); + int64_t axis = static_cast(ctx.Attr("axis")); + const size_t n = ins.size(); + size_t output_offset = 0; + out->mutable_data(ctx.GetPlace()); + auto out_stride = framework::stride(out->dims()); + for (size_t i = 0; i < n; i++) { + auto& in = ins[i]; + auto axis_dim = in->dims()[axis]; + auto in_stride = framework::stride(in->dims()); + StridedMemcpy(ctx.device_context(), in->data(), in_stride, + in->dims(), out_stride, out->data() + output_offset); + output_offset += axis_dim * in_stride[axis]; + } + } +}; + +template +class ConcatGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* in = ctx.Input(framework::GradVarName("Out")); + auto outs = ctx.MultiOutput(framework::GradVarName("X")); + int64_t axis = static_cast(ctx.Attr("axis")); + const size_t n = outs.size(); + size_t input_offset = 0; + auto in_stride = framework::stride(in->dims()); + for (size_t i = 0; i < n; i++) { + auto& out = outs[i]; + out->mutable_data(ctx.GetPlace()); + size_t axis_dim = out->dims()[axis]; + auto out_stride = framework::stride(out->dims()); + StridedMemcpy(ctx.device_context(), in->data() + input_offset, + in_stride, out->dims(), out_stride, out->data()); + input_offset += axis_dim * in_stride[axis]; + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/cond_op.cc b/paddle/fluid/operators/cond_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..dd93790d5b52a2ccc8358a94f7ead346d384f191 --- /dev/null +++ b/paddle/fluid/operators/cond_op.cc @@ -0,0 +1,235 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/cond_op.h" +#include "paddle/fluid/operators/gather.h" +#include "paddle/fluid/operators/scatter.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +using Scope = framework::Scope; +using Variable = framework::Variable; +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using DDim = framework::DDim; + +framework::Scope& CondOp::AddSubScope(const Scope& scope) const { + auto sub_scopes_var = scope.FindVar("SubScopes"); + PADDLE_ENFORCE_NOT_NULL(sub_scopes_var, + "Output(SubScopes) of CondOp should not be null."); + auto sub_scopes = sub_scopes_var->GetMutable>(); + auto& sub_scope = scope.NewScope(); + sub_scopes->push_back(&sub_scope); + return sub_scope; +} + +std::vector& CondOp::GetSubScopes( + const framework::Scope& scope) const { + auto sub_scopes_var = scope.FindVar("SubScopes"); + PADDLE_ENFORCE_NOT_NULL(sub_scopes_var, + "Output(SubScopes) of CondOp should not be null."); + return *sub_scopes_var->GetMutable>(); +} + +LoDTensor& CondOp::AddIndexTensor(const Scope& scope) const { + auto index_tensors_var = scope.FindVar("IndexTensors"); + PADDLE_ENFORCE_NOT_NULL(index_tensors_var, + "Output(IndexTensors) of CondOp should not be null."); + auto& index_tensors = + *index_tensors_var->GetMutable>(); + index_tensors.push_back(LoDTensor()); + return index_tensors.back(); +} + +std::vector& CondOp::GetIndexTensors( + const framework::Scope& scope) const { + auto* index_tensors_var = scope.FindVar("IndexTensors"); + PADDLE_ENFORCE_NOT_NULL(index_tensors_var, + "Output(IndexTensors) of CondOp should not be null."); + return *index_tensors_var->GetMutable>(); +} + +void CondOp::PrepareDataForSubnet( + const framework::Scope& scope, + const platform::DeviceContext& dev_ctx) const { + PADDLE_ENFORCE(!Inputs("Xs").empty(), "Inputs(Xs) of CondOp can't be empty."); + + for (int i = 0; i < BRANCH_NUM; ++i) { + // Create two sub scopes for true and false branches + // sub_scopes[0] for the true branch + // sub_scopes[1] for the false branch + AddSubScope(scope); + // Create two tensors for true and false indices: + // index_tensors[0] for the true branch + // index_tensors[1] for the false branch + AddIndexTensor(scope); + } + + Variable* cond_var = scope.FindVar(Input("Cond")); + PADDLE_ENFORCE_NOT_NULL(cond_var, + "Input(Cond) of CondOp should not be null."); + const LoDTensor* cond = cond_var->GetMutable(); + + // get the true/false index at runtime according to cond tensor + // index_vectors[0]: vector, contains all index for cond[i] == true + // index_vectors[1]: vector, contains all index for cond[i] == false + std::vector> index_vectors; + index_vectors.resize(BRANCH_NUM); + + const int* cond_data = cond->data(); + for (int i = 0; i < cond->dims()[0]; ++i) { + if (cond_data[i]) + index_vectors[TRUE_BRANCH].push_back(i); + else + index_vectors[FALSE_BRANCH].push_back(i); + } + + // put index_vectors[0] and index_vectors[1] into two tensors: + // index_tensors[0] and index_tensors[1] + std::vector& index_tensors = GetIndexTensors(scope); + std::vector& sub_scopes = GetSubScopes(scope); 
+ + for (int i = 0; i < BRANCH_NUM; ++i) { + DDim dim = {static_cast(index_vectors[i].size())}; + int* index_tensor_data_ptr = + index_tensors[i].mutable_data(dim, platform::CPUPlace()); + memcpy(index_tensor_data_ptr, index_vectors[i].data(), + dim[0] * sizeof(int)); + } + + // create input in subscopes according to index_vectors + for (auto& input : Inputs("Xs")) { + Variable* var_parent = scope.FindVar(input); + PADDLE_ENFORCE_NOT_NULL(var_parent); + const auto* tensor_parent = &var_parent->Get(); + + for (int i = 0; i < BRANCH_NUM; ++i) { + Variable* var_child = sub_scopes[i]->FindVar(input); + PADDLE_ENFORCE_NOT_NULL(var_child); + auto* tensor_child = var_child->GetMutable(); + + // Resize child + DDim dim = tensor_parent->dims(); + dim[0] = index_tensors[i].dims()[0]; + tensor_child->mutable_data(dim, platform::CPUPlace()); + + CPUGather(dev_ctx, *tensor_parent, index_tensors[i], tensor_child); + } + } + + // create output_tensors in subscope for sub_net + for (int i = 0; i < BRANCH_NUM; ++i) { + for (auto& output : (*sub_net_op_[i]).Outputs()) { + for (auto& var_name : output.second) { + sub_scopes[i]->Var(var_name); + } + } + } +} + +void CondOp::MergeDataFromSubnet(const framework::Scope& scope, + const platform::DeviceContext& dev_ctx) const { + std::vector& sub_scopes = GetSubScopes(scope); + const std::vector& index_tensors = + GetIndexTensors(scope); + + // Infer the output dim, out_dim[0] = true_dim[0] + false_dim[0] + PADDLE_ENFORCE(!Outputs("Outs").empty(), + "Outputs(Outs) of CondOp can't be empty."); + for (auto& output : Outputs("Outs")) { + const LoDTensor* tensor_t_out = + &sub_scopes[TRUE_BRANCH]->FindVar(output)->Get(); + PADDLE_ENFORCE_NOT_NULL(tensor_t_out, "True output should not be NULL"); + const LoDTensor* tensor_f_out = + &sub_scopes[FALSE_BRANCH]->FindVar(output)->Get(); + PADDLE_ENFORCE_NOT_NULL(tensor_f_out, "False output should not be NULL"); + + auto* var_out = scope.FindVar(output); + PADDLE_ENFORCE_NOT_NULL(var_out, "Output not found"); + LoDTensor* tensor_out = var_out->GetMutable(); + PADDLE_ENFORCE_NOT_NULL(tensor_t_out, + "True output tensor should not be NULL"); + + DDim true_dim = tensor_t_out->dims(); + DDim false_dim = tensor_f_out->dims(); + true_dim[0] = 0; + false_dim[0] = 0; + PADDLE_ENFORCE_EQ(true_dim, false_dim, + "Outputs not of the same shape except the first dim"); + + DDim out_dim = tensor_t_out->dims(); + out_dim[0] = tensor_t_out->dims()[0] + tensor_f_out->dims()[0]; + tensor_out->Resize(out_dim); + tensor_out->mutable_data(platform::CPUPlace()); + } + + // merge output results: + // output_tensor = true_output_tensor + false_output_tensor + for (auto& output : Outputs("Outs")) { + Variable* var_parent = scope.FindVar(output); + PADDLE_ENFORCE_NOT_NULL(var_parent); + auto* tensor_parent = var_parent->GetMutable(); + + for (int i = 0; i < BRANCH_NUM; ++i) { + Variable* var_child = sub_scopes[i]->FindVar(output); + PADDLE_ENFORCE_NOT_NULL(var_child); + auto* tensor_child = &var_child->Get(); + ScatterAssign(dev_ctx, *tensor_child, index_tensors[i], + tensor_parent); + } + } +} + +void CondOp::Run(const Scope& scope, const platform::Place& place) const { + // get device context from pool + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& dev_ctx = *pool.Get(place); + + PrepareDataForSubnet(scope, dev_ctx); + std::vector& sub_scopes = GetSubScopes(scope); + for (int i = 0; i < BRANCH_NUM; ++i) { + sub_net_op_[i]->Run(*sub_scopes[i], place); + } + MergeDataFromSubnet(scope, dev_ctx); +} + +class 
CondOpProtoAndCheckerMaker : public framework::OpProtoAndCheckerMaker { + public: + CondOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Cond", "The condition, which is a bool vector"); + AddInput("Xs", "Inputs of Subnets").AsDuplicable(); + AddOutput("Outs", "Outputs of Cond_Op after merge").AsDuplicable(); + + AddOutput("SubScopes", "sub scopes for true and false branches"); + AddOutput("IndexTensors", "Index Tensors contains indices for true/false"); + + AddComment(R"DOC( +Sample Dependent Conditional Operator. + +Given Cond[i] as a 1/0 vector to indicate true/false: +Out[i] = subnet_true[i], if Cond[i] == true +Out[i] = subnet_false[i], if Cond[i] == false + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_WITHOUT_GRADIENT(cond, paddle::operators::CondOp, + paddle::operators::CondOpProtoAndCheckerMaker); diff --git a/paddle/fluid/operators/cond_op.h b/paddle/fluid/operators/cond_op.h new file mode 100644 index 0000000000000000000000000000000000000000..695af4490696b29d2d47f5825ebc0159b39663c0 --- /dev/null +++ b/paddle/fluid/operators/cond_op.h @@ -0,0 +1,94 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "glog/logging.h" +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/net_op.h" + +namespace paddle { +namespace operators { + +/* + * @brief CondOp is a dynamic if-else Operator + * + * It has a input tensor named cond indicating which netop each instance will + * run. + * + * if cond == 1, it will run true_net, which is a NetOp. + * + * if cond == 0, it will run false_net, which is another NetOp. + */ +class CondOp : public framework::OperatorBase { + public: + CondOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) { + sub_net_op_.resize(BRANCH_NUM); + } + + CondOp(const CondOp& o) + : framework::OperatorBase( + static_cast(o)) { + // TODO(yuyang18): Implement copy ctor well. 
+ PADDLE_THROW("Not implemented"); + } + + framework::Scope& AddSubScope(const framework::Scope& scope) const; + std::vector& GetSubScopes( + const framework::Scope& scope) const; + + framework::LoDTensor& AddIndexTensor(const framework::Scope& scope) const; + std::vector& GetIndexTensors( + const framework::Scope& scope) const; + + void PrepareDataForSubnet(const framework::Scope& scope, + const platform::DeviceContext& dev_ctx) const; + void MergeDataFromSubnet(const framework::Scope& scope, + const platform::DeviceContext& dev_ctx) const; + + /* + * Set True Block + */ + void set_truenet(std::unique_ptr&& net) { + sub_net_op_[TRUE_BRANCH] = std::move(net); + } + + /* + * Set False Block + */ + void set_falsenet(std::unique_ptr&& net) { + sub_net_op_[FALSE_BRANCH] = std::move(net); + } + + void Run(const framework::Scope& scope, + const platform::Place& place) const override; + + private: + const int TRUE_BRANCH = 0; + const int FALSE_BRANCH = 1; + const int BRANCH_NUM = 2; + + // sub_net_op_[0]: subnet_t + // sub_net_op_[1]: subnet_f + std::vector> sub_net_op_; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/conditional_block_op.cc b/paddle/fluid/operators/conditional_block_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..30435c6cca0a4fb1d41dce47b8fefeafb6c48a51 --- /dev/null +++ b/paddle/fluid/operators/conditional_block_op.cc @@ -0,0 +1,229 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
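As an illustrative aside (standalone C++, not part of the patch), CondOp above routes each row of the mini-batch to either the true or the false subnet: it partitions row indices by the Cond vector, gathers the selected rows for each branch, runs the branch, and scatters the branch outputs back into one tensor. A minimal sketch of that split-and-merge flow on plain arrays, where the two "subnets" are just placeholder element-wise functions:

    #include <cstdio>
    #include <vector>

    int main() {
      std::vector<int> cond = {1, 0, 1, 0};         // per-row condition
      std::vector<float> x = {1.f, 2.f, 3.f, 4.f};  // one value per row

      // 1) Partition row indices by the condition (cf. PrepareDataForSubnet).
      std::vector<int> idx_true, idx_false;
      for (int i = 0; i < static_cast<int>(cond.size()); ++i) {
        (cond[i] ? idx_true : idx_false).push_back(i);
      }

      // 2) Gather + run each "subnet" (placeholders: x+10 for true, x-10 for false),
      //    then scatter the results back by index (cf. MergeDataFromSubnet).
      std::vector<float> out(x.size());
      for (int i : idx_true) out[i] = x[i] + 10.f;
      for (int i : idx_false) out[i] = x[i] - 10.f;

      for (float v : out) std::printf("%g ", v);  // 11 -8 13 -6
      std::printf("\n");
      return 0;
    }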
*/ +#include +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class ConditionalOp : public framework::OperatorBase { + public: + ConditionalOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + protected: + std::vector InputTensors( + const framework::Scope &scope) const { + std::vector retv; + auto xs = Inputs("X"); + retv.resize(xs.size(), nullptr); + std::transform( + xs.begin(), xs.end(), retv.begin(), + [&scope](const std::string &var_name) -> const framework::LoDTensor * { + auto *var = scope.FindVar(var_name); + PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", var_name); + return &var->Get(); + }); + return retv; + } + + bool ScalarCondition( + const std::vector &ips) const { + if (!(ips.size() == 1UL && ips[0]->IsInitialized())) { + PADDLE_THROW("should have one initialized input as condition"); + } + if (!(ips[0]->type().hash_code() == typeid(bool).hash_code() && + ips[0]->numel() == 1)) { + PADDLE_THROW( + "condition input's data type should be bool, " + "numel should be 1, actual numel is %d", + ips[0]->numel()); + } + return ips[0]->data()[0]; + } +}; + +class ConditionalBlockOp : public ConditionalOp { + public: + ConditionalBlockOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : ConditionalOp(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &dev_place) const override { + auto xs = InputTensors(scope); + + bool need_run; + if (Attr("is_scalar_condition")) { + need_run = ScalarCondition(xs); + } else { + need_run = std::all_of( + xs.begin(), xs.end(), + [](const framework::LoDTensor *t) { return t->numel() != 0; }); + } + + if (need_run) { + auto *scope_var = scope.FindVar(Output("Scope")); + PADDLE_ENFORCE(scope_var != nullptr, "Must set scope"); + auto *scopes = scope_var->GetMutable>(); + scopes->resize(1); + scopes->front() = &scope.NewScope(); + auto &cur_scope = *scopes->front(); + + framework::Executor exec(dev_place); + auto *block = Attr("sub_block"); + exec.Run(*block->Program(), &cur_scope, block->ID(), false); + } + } +}; + +class ConditionalBlockOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + ConditionalBlockOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "The conditional variable of this operator. If X is empty, the " + "whole sub-block will not be executed.") + .AsDuplicable(); + AddInput("Params", "The input variables of the sub-block.").AsDuplicable(); + AddOutput("Out", "The output variables of the sub-block.").AsDuplicable(); + AddOutput("Scope", + "(std::vector) The step scope of conditional block. To " + "unify the conditional block, rnn and while op, the type of " + "scope is std::vector"); + AddAttr( + "sub_block", "The step block of conditional block operator"); + AddAttr("is_scalar_condition", + "the input X is used as scalar " + "condition") + .SetDefault(false); + AddComment(R"DOC(Conditional block operator + +Run the sub-block if X is not empty. Params is the other inputs and Out is the +outputs of the sub-block. 
+)DOC"); + } +}; + +class ConditionalBlockGradOp : public ConditionalOp { + public: + ConditionalBlockGradOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : ConditionalOp(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &dev_place) const override { + auto xs = this->InputTensors(scope); + + bool need_run; + if (Attr("is_scalar_condition")) { + need_run = ScalarCondition(xs); + } else { + need_run = std::all_of( + xs.begin(), xs.end(), + [](const framework::LoDTensor *t) { return t->numel() != 0; }); + } + + if (need_run) { + auto *scope_var = scope.FindVar(Input("Scope")); + PADDLE_ENFORCE(scope_var != nullptr, "Must set scope"); + auto &scopes = scope_var->Get>(); + framework::Scope &cur_scope = *scopes[0]; + + framework::Executor exec(dev_place); + auto *block = Attr("sub_block"); + exec.Run(*block->Program(), &cur_scope, block->ID(), false); + + AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("Params"), + Outputs(framework::GradVarName("Params"))); + + AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("X"), + Outputs(framework::GradVarName("X"))); + } + } + + private: + void AssignLocalGradientToGlobal( + const platform::Place &place, const framework::Scope &cur_scope, + const std::vector &p_names, + const std::vector &pg_names) const { + for (size_t i = 0; i < p_names.size(); ++i) { + auto out_grad_name = pg_names[i]; + auto in_grad_name = framework::GradVarName(p_names[i]); + auto *in_var = cur_scope.FindVar(in_grad_name); + if (in_var == nullptr) { + continue; + } + auto new_in_grad_name = cur_scope.Rename(in_grad_name); + auto assign = framework::OpRegistry::CreateOp( + "assign", {{"X", {new_in_grad_name}}}, {{"Out", {out_grad_name}}}, + framework::AttributeMap{}); + assign->Run(cur_scope, place); + cur_scope.Rename(new_in_grad_name, in_grad_name); + } + } +}; + +class ConditionalBlockGradInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInputs("X")); + if (context->HasInputs("Params")) { + PADDLE_ENFORCE(context->HasOutputs(framework::GradVarName("Params"))); + context->SetOutputsDim(framework::GradVarName("Params"), + context->GetInputsDim("Params")); + } + PADDLE_ENFORCE(context->HasOutputs(framework::GradVarName("X"))); + context->SetOutputsDim(framework::GradVarName("X"), + context->GetInputsDim("X")); + } +}; + +class ConditionalBlockGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto grad_op = new framework::OpDesc(); + grad_op->SetType("conditional_block_grad"); + grad_op->SetInput("X", Input("X")); + grad_op->SetInput("Params", Input("Params")); + grad_op->SetInput("Out", Output("Out")); + grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + grad_op->SetInput("Scope", Output("Scope")); + grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X", false)); + grad_op->SetOutput(framework::GradVarName("Params"), + InputGrad("Params", false)); + grad_op->SetBlockAttr("sub_block", *this->grad_block_[0]); + grad_op->SetAttr("is_scalar_condition", GetAttr("is_scalar_condition")); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; 
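As an illustrative aside (standalone C++, not part of the patch), ConditionalBlockOp above decides whether to execute its sub-block in one of two modes: when is_scalar_condition is set it reads a single bool from the one-element condition tensor, and otherwise it runs only if every input tensor is non-empty. A sketch of just that decision; FakeTensor is a hypothetical stand-in invented for the example:

    #include <cstdio>
    #include <vector>

    struct FakeTensor {          // stand-in for framework::LoDTensor
      std::vector<float> data;   // numel() == 0 means "empty input"
      bool scalar_bool = false;  // value used in scalar-condition mode
      long numel() const { return static_cast<long>(data.size()); }
    };

    bool NeedRun(const std::vector<FakeTensor>& xs, bool is_scalar_condition) {
      if (is_scalar_condition) {
        // The real op additionally enforces exactly one bool input with numel() == 1.
        return xs.front().scalar_bool;
      }
      for (const auto& t : xs) {
        if (t.numel() == 0) return false;  // any empty input skips the block
      }
      return true;
    }

    int main() {
      FakeTensor cond;
      cond.data = {1.f};
      cond.scalar_bool = true;
      std::printf("%d\n", NeedRun({cond}, true) ? 1 : 0);           // 1
      std::printf("%d\n", NeedRun({FakeTensor{}}, false) ? 1 : 0);  // 0
      return 0;
    }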
+REGISTER_OPERATOR(conditional_block, ops::ConditionalBlockOp, + ops::ConditionalBlockOpProtoMaker, + ops::ConditionalBlockGradMaker); +REGISTER_OPERATOR(conditional_block_grad, ops::ConditionalBlockGradOp, + ops::ConditionalBlockGradInferShape); diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..a729d376ac8c3dc49ec06271c3ffef6406a20b28 --- /dev/null +++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc @@ -0,0 +1,330 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/operators/conv_op.h" +#include "paddle/fluid/platform/assert.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; +using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; +using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor; +using DataLayout = platform::DataLayout; + +static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES = + static_cast(1024) * 1024 * 1024; + +template +class CUDNNConvOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + auto* input = ctx.Input("Input"); + auto* filter = ctx.Input("Filter"); + auto* output = ctx.Output("Output"); + + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + std::vector dilations = ctx.Attr>("dilations"); + int groups = ctx.Attr("groups"); + int64_t user_workspace_size = + static_cast(ctx.Attr("workspace_size_MB")); + + const T* input_data = input->data(); + const T* filter_data = filter->data(); + T* output_data = output->mutable_data(ctx.GetPlace()); + + // ------------------- cudnn descriptors --------------------- + ScopedTensorDescriptor input_desc; + ScopedTensorDescriptor output_desc; + ScopedFilterDescriptor filter_desc; + ScopedConvolutionDescriptor conv_desc; + DataLayout layout = DataLayout::kNCHW; + if (input->dims().size() == 5) { + layout = DataLayout::kNCDHW; + } + + cudnnConvolutionDescriptor_t cudnn_conv_desc = + conv_desc.descriptor(paddings, strides, dilations); + +#if CUDNN_VERSION_MIN(7, 0, 1) + // cudnn 7 can support groups, no need to do it mannually + // FIXME(typhoonzero): find a better way to disable groups + // rather than setting it to 1. 
+ PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount( + cudnn_conv_desc, groups)); + groups = 1; +#endif + + cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + layout, framework::vectorize2int(input->dims()), groups); + cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( + layout, framework::vectorize2int(output->dims()), groups); + cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor( + layout, framework::vectorize2int(filter->dims()), groups); + + int input_channels = input->dims()[1]; + int input_height, input_width, input_depth; + if (input->dims().size() == 5) { + input_depth = input->dims()[2]; + input_height = input->dims()[3]; + input_width = input->dims()[4]; + } else { // dim size is enforced in InferShape + input_depth = 1; + input_height = input->dims()[2]; + input_width = input->dims()[3]; + } + int output_channels = filter->dims()[0]; + int output_height, output_width, output_depth; + if (output->dims().size() == 5) { + output_depth = output->dims()[2]; + output_height = output->dims()[3]; + output_width = output->dims()[4]; + } else { + output_depth = 1; + output_height = output->dims()[2]; + output_width = output->dims()[3]; + } + + int group_offset_in = + input_channels / groups * input_height * input_width * input_depth; + int group_offset_out = + output_channels / groups * output_height * output_width * output_depth; + int group_offset_filter = filter->numel() / groups; + // ------------------- cudnn conv workspace --------------------- + void* cudnn_workspace = nullptr; + size_t workspace_size_in_bytes; // final workspace to allocate. + size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES; + if (user_workspace_size > 0) { + workspace_size_limit = user_workspace_size * 1024 * 1024; + } + // ------------------- cudnn conv algorithm --------------------- + cudnnConvolutionFwdAlgo_t algo; + auto& dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); + + PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( + handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, + cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &algo)); + // get workspace size able to allocate + PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( + handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, + cudnn_output_desc, algo, &workspace_size_in_bytes)); + // Allocate on GPU memory + platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); + cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); + // ------------------- cudnn conv forward --------------------- + T alpha = 1.0f, beta = 0.0f; + for (int i = 0; i < groups; i++) { + PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward( + handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, + cudnn_filter_desc, filter_data + i * group_offset_filter, + cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes, + &beta, cudnn_output_desc, output_data + i * group_offset_out)); + } + // Release the cudnn workspace + paddle::memory::Free(gpu, cudnn_workspace); + } +}; + +template +class CUDNNConvGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + auto input = ctx.Input("Input"); + auto filter = ctx.Input("Filter"); + auto output_grad = 
ctx.Input(framework::GradVarName("Output")); + auto input_grad = ctx.Output(framework::GradVarName("Input")); + auto filter_grad = ctx.Output(framework::GradVarName("Filter")); + + const T* input_data = input->data(); + const T* output_grad_data = output_grad->data(); + const T* filter_data = filter->data(); + + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + std::vector dilations = ctx.Attr>("dilations"); + int groups = ctx.Attr("groups"); + int64_t user_workspace_size = + static_cast(ctx.Attr("workspace_size_MB")); + + // ------------------- cudnn descriptors --------------------- + ScopedTensorDescriptor input_desc; + ScopedTensorDescriptor output_grad_desc; + + ScopedFilterDescriptor filter_desc; + ScopedFilterDescriptor filter_grad_desc; + ScopedConvolutionDescriptor conv_desc; + DataLayout layout = DataLayout::kNCHW; + if (input->dims().size() == 5) { + layout = DataLayout::kNCDHW; + } + + cudnnConvolutionDescriptor_t cudnn_conv_desc = + conv_desc.descriptor(paddings, strides, dilations); + +#if CUDNN_VERSION_MIN(7, 0, 1) + // cudnn 7 can support groups, no need to do it mannually + // FIXME(typhoonzero): find a better way to disable groups + // rather than setting it to 1. + PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount( + cudnn_conv_desc, groups)); + groups = 1; +#endif + + cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + layout, framework::vectorize2int(input->dims()), groups); + cudnnTensorDescriptor_t cudnn_output_grad_desc = + output_grad_desc.descriptor( + layout, framework::vectorize2int(output_grad->dims()), groups); + cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor( + layout, framework::vectorize2int(filter->dims()), groups); + + int input_channels = input->dims()[1]; + int input_height, input_width, input_depth; + if (input->dims().size() == 5) { + input_depth = input->dims()[2]; + input_height = input->dims()[3]; + input_width = input->dims()[4]; + } else { // dim size is enforced in InferShape + input_depth = 1; + input_height = input->dims()[2]; + input_width = input->dims()[3]; + } + + int output_grad_channels = filter->dims()[0]; + int output_grad_height, output_grad_width, output_grad_depth; + if (input->dims().size() == 5) { + output_grad_depth = output_grad->dims()[2]; + output_grad_height = output_grad->dims()[3]; + output_grad_width = output_grad->dims()[4]; + } else { + output_grad_depth = 1; + output_grad_height = output_grad->dims()[2]; + output_grad_width = output_grad->dims()[3]; + } + + int group_offset_in = + input_channels / groups * input_height * input_width * input_depth; + int group_offset_out = output_grad_channels / groups * output_grad_height * + output_grad_width * output_grad_depth; + int group_offset_filter = filter->numel() / groups; + // ------------------- cudnn backward algorithm --------------------- + cudnnConvolutionBwdDataAlgo_t data_algo; + cudnnConvolutionBwdFilterAlgo_t filter_algo; + size_t workspace_size_in_bytes = 0, tmp_size = 0; + size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES; + if (user_workspace_size > 0) { + workspace_size_limit = user_workspace_size * 1024 * 1024; + } + + auto& dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); + if (input_grad) { + PADDLE_ENFORCE( + platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( + handle, cudnn_filter_desc, + // dyDesc: Handle to the previously initialized input differential + // tensor descriptor. 
+ cudnn_output_grad_desc, cudnn_conv_desc, + // dxDesc: Handle to the previously initialized output tensor + // descriptor. + cudnn_input_desc, + CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &data_algo)); + PADDLE_ENFORCE( + platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( + handle, cudnn_filter_desc, cudnn_output_grad_desc, + cudnn_conv_desc, cudnn_input_desc, data_algo, &tmp_size)); + workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size); + } + + if (filter_grad) { + PADDLE_ENFORCE( + platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( + handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc, + cudnn_filter_desc, + CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &filter_algo)); + + PADDLE_ENFORCE( + platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( + handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc, + cudnn_filter_desc, filter_algo, &tmp_size)); + workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size); + } + // ------------------- cudnn conv workspace --------------------- + // Already on GPU + void* cudnn_workspace = nullptr; + platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); + cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); + // ------------------- cudnn conv backward data --------------------- + T alpha = 1.0f, beta = 0.0f; + if (input_grad) { + T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); + // Because beta is zero, it is unnecessary to reset input_grad. + + for (int i = 0; i < groups; i++) { + PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( + handle, &alpha, cudnn_filter_desc, + filter_data + i * group_offset_filter, cudnn_output_grad_desc, + output_grad_data + i * group_offset_out, cudnn_conv_desc, data_algo, + cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc, + input_grad_data + i * group_offset_in)); + } + } + // ------------------- cudnn conv backward filter --------------------- + if (filter_grad) { + T* filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); + // Because beta is zero, it is unnecessary to reset filter_grad. 
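+      // The workspace allocated above was sized as the max of the backward-
+      // data and backward-filter requirements, so the same buffer is reused
+      // by each cudnnConvolutionBackwardFilter call in this loop.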
+ for (int i = 0; i < groups; i++) { + PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( + handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, + cudnn_output_grad_desc, output_grad_data + i * group_offset_out, + cudnn_conv_desc, filter_algo, cudnn_workspace, + workspace_size_in_bytes, &beta, cudnn_filter_desc, + filter_grad_data + i * group_offset_filter)); + } + } + // Release the cudnn workspace + paddle::memory::Free(gpu, cudnn_workspace); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_KERNEL(conv2d, CUDNN, ::paddle::platform::CUDAPlace, + paddle::operators::CUDNNConvOpKernel, + paddle::operators::CUDNNConvOpKernel); +REGISTER_OP_KERNEL(conv2d_grad, CUDNN, ::paddle::platform::CUDAPlace, + paddle::operators::CUDNNConvGradOpKernel, + paddle::operators::CUDNNConvGradOpKernel); + +REGISTER_OP_KERNEL(conv3d, CUDNN, ::paddle::platform::CUDAPlace, + paddle::operators::CUDNNConvOpKernel, + paddle::operators::CUDNNConvOpKernel); +REGISTER_OP_KERNEL(conv3d_grad, CUDNN, ::paddle::platform::CUDAPlace, + paddle::operators::CUDNNConvGradOpKernel, + paddle::operators::CUDNNConvGradOpKernel); diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a047e579163cfe9cd0d053f337b7a92339466a96 --- /dev/null +++ b/paddle/fluid/operators/conv_op.cc @@ -0,0 +1,354 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/conv_op.h" + +namespace paddle { +namespace operators { + +void ConvOp::InferShape(framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of ConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Filter"), + "Input(Filter) of ConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Output"), + "Output(Output) of ConvOp should not be null."); + + auto in_dims = ctx->GetInputDim("Input"); + auto filter_dims = ctx->GetInputDim("Filter"); + std::vector strides = ctx->Attrs().Get>("strides"); + std::vector paddings = ctx->Attrs().Get>("paddings"); + int groups = ctx->Attrs().Get("groups"); + std::vector dilations = ctx->Attrs().Get>("dilations"); + + PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5, + "Conv intput should be 4-D or 5-D tensor."); + PADDLE_ENFORCE_EQ( + in_dims.size(), filter_dims.size(), + "Conv input dimension and filter dimension should be the same."); + PADDLE_ENFORCE( + in_dims.size() - strides.size() == 2U, + "Conv input dimension and strides dimension should be consistent."); + PADDLE_ENFORCE_EQ( + paddings.size(), strides.size(), + "Conv paddings dimension and Conv strides dimension should be the same."); + + PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[1] * groups, + "The number of input channels should be equal to filter " + "channels * groups."); + + PADDLE_ENFORCE_EQ( + filter_dims[0] % groups, 0, + "The number of output channels should be divided by groups."); + + std::vector output_shape({in_dims[0], filter_dims[0]}); + for (size_t i = 0; i < strides.size(); ++i) { + PADDLE_ENFORCE(in_dims[i + 2] + 2 * paddings[i] - + (dilations[i] * (filter_dims[i + 2] - 1) + 1) > + 0, + "Due to the settings of paddings, filter_dims and " + "dilations, the output size is less than 0, please check " + "again."); + output_shape.push_back(OutputSize(in_dims[i + 2], filter_dims[i + 2], + dilations[i], paddings[i], strides[i])); + } + ctx->SetOutputDim("Output", framework::make_ddim(output_shape)); + ctx->ShareLoD("Input", "Output"); +} + +framework::OpKernelType ConvOp::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + bool use_cudnn = ctx.Attr("use_cudnn"); + use_cudnn &= platform::is_gpu_place(ctx.GetPlace()); +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(ctx.GetPlace())) { + auto& dev_ctx = ctx.template device_context(); + use_cudnn &= dev_ctx.cudnn_handle() != nullptr; + } +#endif + framework::LibraryType library_; + if (use_cudnn) { + library_ = framework::LibraryType::kCUDNN; + } else { + library_ = framework::LibraryType::kPlain; + } + + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), + layout_, library_); +} + +Conv2DOpMaker::Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "Input", + "(Tensor) The input tensor of convolution operator. " + "The format of input tensor is NCHW, where N is batch size, C is the " + "number of channels, H is the height of the feature, " + "and W is the width of the feature."); + AddInput("Filter", + "(Tensor) The filter tensor of convolution operator. " + "The format of the filter tensor is MCHW, where M is the number of " + "output image channels, C is the number of input image channels, " + "H is the height of the filter, and W is the width of the filter. 
" + "If the groups attribute is greater than 1, C equals the number of " + "input image channels divided by the groups."); + AddOutput("Output", + "(Tensor) The output tensor of convolution operator. " + "The format of output tensor is also NCHW."); + AddAttr>("strides", + "(vector default:{1, 1}), the " + "strides(h_stride, w_stride) of " + "convolution operator.") + .SetDefault({1, 1}); + AddAttr>("paddings", + "(vector default:{0, 0}), the " + "paddings(h_pad, w_pad) of " + "convolution operator.") + .SetDefault({0, 0}); + AddAttr( + "groups", + "(int default:1), the groups number of the convolution operator. " + "According to grouped convolution in Alex Krizhevsky's Deep CNN paper: " + "when group=2, the first half of the filters is only connected to the " + "first half of the input channels, while the second half of the filters " + "is only connected to the second half of the input channels.") + .SetDefault(1); + AddAttr>("dilations", + "(vector default:{1, 1}), the " + "dilations(h_dilation, w_dilation) of " + "convolution operator.") + .SetDefault({1, 1}); + AddAttr( + "use_cudnn", + "(bool, default false) Only used in cudnn kernel, need install cudnn") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); + // TODO(dzhwinter): need to registered layout transform function + AddAttr("workspace_size_MB", + "Only used in cudnn kernel. Need set use_cudnn to true." + "workspace size for cudnn, in MB, " + "workspace is a section of GPU memory which will be " + "allocated/freed each time the operator runs, larger " + "workspace size can increase performance but also requires " + "better hardware. This size should be chosen carefully.") + .SetDefault(4096); + AddComment(R"DOC( +Convolution Operator. + +The convolution operation calculates the output based on the input, filter +and strides, paddings, dilations, groups parameters. The size of each dimension of the +parameters is checked in the infer-shape. +Input(Input) and Output(Output) are in NCHW format. Where N is batch +size, C is the number of channels, H is the height of the feature, and W is +the width of the feature. +Filters(Input) is MCHW format. Where M is the number of output image channels, C is +the number of input image channels, H is the height of the filter, and W +is the width of the filter. +Parameters(strides, paddings, dilations) are two elements. These two elements represent +height and width, respectively. +The input(X) size and output(Out) size may be different. + +Example: + Input: + Input shape: $(N, C_{in}, H_{in}, W_{in})$ + Filter shape: $(C_{out}, C_{in}, H_f, W_f)$ + Output: + Output shape: $(N, C_{out}, H_{out}, W_{out})$ + Where +$$ + H_{out}= \frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]}+ 1 \\ + W_{out}= \frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]}+ 1 +$$ +)DOC"); +} + +Conv3DOpMaker::Conv3DOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "Input", + "(Tensor) The input tensor of convolution operator. " + "The format of input tensor is NCDHW. 
Where N is batch size, C is the " + "number of channels, D is the depth of the feature, H is the height of " + "the feature, " + "and W is the width of the feature."); + AddInput("Filter", + "(Tensor) The filter tensor of convolution operator. " + "The format of the filter tensor is MCDHW, where M is the number of " + "output image channels, C is the number of input image channels, " + "D is the depth of the filter, H is the height of the filter, and W " + "is the width of the filter." + "If the groups attribute is greater than 1, C equals the number of " + "input image channels divided by the groups."); + AddOutput("Output", + "(Tensor) The output tensor of convolution operator." + "The format of output tensor is also NCDHW."); + AddAttr>("strides", + "(vector, default:{1, 1, 1}), the " + "strides(d_stride, h_stride, w_stride) of " + "convolution operator.") + .SetDefault({1, 1, 1}); + AddAttr>("paddings", + "(vector, default:{0, 0, 0}), the " + "paddings(d_pad, h_pad, w_pad) of convolution " + "operator.") + .SetDefault({0, 0, 0}); + AddAttr( + "groups", + "(int default:1), the groups number of the convolution operator. " + "According to grouped convolution in Alex Krizhevsky's Deep CNN paper: " + "when group=2, the first half of the filters is only connected to the " + "first half of the input channels, while the second half of the filters " + "is only connected to the second half of the input channels.") + .SetDefault(1); + AddAttr>("dilations", + "(vector default:{1, 1, 1}), the " + "dilations(d_dilation, h_dilation, w_dilation) of " + "convolution operator.") + .SetDefault({1, 1, 1}); + AddAttr( + "use_cudnn", + "(bool, default false) Only used in cudnn kernel, need install cudnn") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); + // TODO(dzhwinter): need to registered layout transform function + AddAttr("workspace_size_MB", + "Only used in cudnn kernel. workspace size for cudnn, in MB, " + "workspace is a section of GPU memory which will be " + "allocated/freed each time the operator runs, larger " + "workspace size can increase performance but also requires " + "better hardware. This size should be chosen carefully.") + .SetDefault(4096); + + AddComment(R"DOC( +Convolution3D Operator. + +The convolution operation calculates the output based on the input, filter +and strides, paddings, dilations, groups parameters. The size of each dimension of the +parameters is checked in the infer-shape. +Input(Input) and output(Output) are in NCDHW format, where N is batch +size, C is the number of channels,D is the depth of the feature, H is the height of +the feature, and W is the width of the feature. +Filters(Input) is MCDHW format, where M is the number of output image channels, +C is the number of input image channels, D is the depth of the filter, +H is the height of the filter, and W is the width of the filter. +Parameters(strides, paddings, dilations) are three elements. These three elements +represent depth, height and width, respectively. +The input(X) size and output(Out) size may be different. 
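+For instance (illustrative values only), a 3 x 3 x 3 filter applied with
+paddings = {1, 1, 1}, strides = {1, 1, 1} and dilations = {1, 1, 1} keeps the
+spatial size unchanged; for an input depth of 8:
+
+$$ D_{out} = \frac{(8 + 2 * 1 - (1 * (3 - 1) + 1))}{1} + 1 = 8 $$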
+ +Example: + Input: + Input shape: $(N, C_{in}, D_{in}, H_{in}, W_{in})$ + Filter shape: $(C_{out}, C_{in}, D_f, H_f, W_f)$ + Output: + Output shape: $(N, C_{out}, D_{out}, H_{out}, W_{out})$ + Where + $$ + D_{out}= \frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{ strides[0]}+ 1 \\ + H_{out}= \frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{ strides[1]}+ 1 \\ + W_{out}= \frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{ strides[2]}+ 1 + $$ +)DOC"); +} + +void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const { + auto in_dims = ctx->GetInputDim("Input"); + auto filter_dims = ctx->GetInputDim("Filter"); + if (ctx->HasOutput(framework::GradVarName("Input"))) { + ctx->SetOutputDim(framework::GradVarName("Input"), in_dims); + } + if (ctx->HasOutput(framework::GradVarName("Filter"))) { + ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims); + } +} + +framework::OpKernelType ConvOpGrad::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + bool use_cudnn = ctx.Attr("use_cudnn"); + use_cudnn &= platform::is_gpu_place(ctx.GetPlace()); +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(ctx.GetPlace())) { + auto& dev_ctx = ctx.template device_context(); + use_cudnn &= dev_ctx.cudnn_handle() != nullptr; + } +#endif + + framework::LibraryType library_; + if (use_cudnn) { + library_ = framework::LibraryType::kCUDNN; + } else { + library_ = framework::LibraryType::kPlain; + } + + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), + layout_, library_); +} + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(conv2d, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad, + ops::ConvOpGrad); + +// depthwise convolution op +REGISTER_OP(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker, + depthwise_conv2d_grad, ops::ConvOpGrad); +REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad, + ops::ConvOpGrad); + +// depthwise conv kernel +// TODO(xingzhaolong): neon kernel for mobile +REGISTER_OP_CPU_KERNEL( + depthwise_conv2d, + ops::GemmConvKernel, + ops::GemmConvKernel); + +REGISTER_OP_CPU_KERNEL( + depthwise_conv2d_grad, + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); + +REGISTER_OP_CPU_KERNEL( + conv2d, ops::GemmConvKernel, + ops::GemmConvKernel); +REGISTER_OP_CPU_KERNEL( + conv2d_grad, + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); + +REGISTER_OP_CPU_KERNEL( + conv3d, ops::GemmConvKernel, + ops::GemmConvKernel); +REGISTER_OP_CPU_KERNEL( + conv3d_grad, + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); diff --git a/paddle/fluid/operators/conv_op.cu.cc b/paddle/fluid/operators/conv_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..b2129d3b461249b5d1b317edde924ffc04f4f90f --- /dev/null +++ b/paddle/fluid/operators/conv_op.cu.cc @@ -0,0 +1,43 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/conv_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + depthwise_conv2d, + ops::DepthwiseConvKernel, + ops::DepthwiseConvKernel); + +REGISTER_OP_CUDA_KERNEL( + depthwise_conv2d_grad, + ops::DepthwiseConvGradKernel, + ops::DepthwiseConvGradKernel); + +REGISTER_OP_CUDA_KERNEL( + conv2d, ops::GemmConvKernel, + ops::GemmConvKernel); +REGISTER_OP_CUDA_KERNEL( + conv2d_grad, + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); + +REGISTER_OP_CUDA_KERNEL( + conv3d, ops::GemmConvKernel, + ops::GemmConvKernel); +REGISTER_OP_CUDA_KERNEL( + conv3d_grad, + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h new file mode 100644 index 0000000000000000000000000000000000000000..1156e6c8fe3263607d4dcd1af0c9996acd9368fb --- /dev/null +++ b/paddle/fluid/operators/conv_op.h @@ -0,0 +1,422 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/depthwise_conv.h" +#include "paddle/fluid/operators/math/im2col.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/vol2col.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +// Base convolution operator definations for other conv +// like operators to reuse the implementation. +inline int OutputSize(int input_size, int filter_size, int dilation, + int padding, int stride) { + const int dkernel = dilation * (filter_size - 1) + 1; + const int output_size = (input_size + 2 * padding - dkernel) / stride + 1; + return output_size; +} +inline bool IsExpand(std::vector& filter_dim, + std::vector& strides, std::vector& paddings, + std::vector& dilations) { + bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true; + for (size_t j = 0; j < strides.size(); ++j) { + filter_1 = filter_1 && (static_cast(filter_dim[j + 2]) == 1); + strides_1 = strides_1 && (strides[j] == 1); + padding_0 = padding_0 && (paddings[j] == 0); + dilation_1 = dilation_1 && (dilations[j] == 1); + } + return !(filter_1 && strides_1 && padding_0 && dilation_1); +} + +// Define Op classes in .h file so that other conv +// operator implementations can reuse the code. 
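Before the Op maker and Op class declarations, a concrete note on the IsExpand helper above: the GEMM-based kernels skip the im2col/vol2col expansion only for a 1 x 1 (or 1 x 1 x 1) filter with unit strides, zero paddings and unit dilations, in which case the convolution reduces to a plain matrix multiplication. A small self-contained sketch of that rule (the helper is restated locally under an illustrative name so the snippet compiles on its own; it is not the actual header):

#include <cstdint>
#include <cstdio>
#include <vector>

// Local restatement of the IsExpand rule above. filter_dim follows the
// {out_channels, in_channels, k_h, k_w} layout used by the conv operators.
static bool NeedsIm2Col(const std::vector<int64_t> &filter_dim,
                        const std::vector<int> &strides,
                        const std::vector<int> &paddings,
                        const std::vector<int> &dilations) {
  bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true;
  for (size_t j = 0; j < strides.size(); ++j) {
    filter_1 = filter_1 && (filter_dim[j + 2] == 1);
    strides_1 = strides_1 && (strides[j] == 1);
    padding_0 = padding_0 && (paddings[j] == 0);
    dilation_1 = dilation_1 && (dilations[j] == 1);
  }
  return !(filter_1 && strides_1 && padding_0 && dilation_1);
}

int main() {
  // 1x1 convolution, stride 1, no padding, no dilation: plain GEMM, no im2col.
  std::printf("%d\n", NeedsIm2Col({64, 3, 1, 1}, {1, 1}, {0, 0}, {1, 1}));  // 0
  // 3x3 convolution: the input must be expanded with im2col before the GEMM.
  std::printf("%d\n", NeedsIm2Col({64, 3, 3, 3}, {1, 1}, {0, 0}, {1, 1}));  // 1
  return 0;
}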
+class Conv2DOpMaker : public framework::OpProtoAndCheckerMaker { + public: + Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker); +}; + +class Conv3DOpMaker : public framework::OpProtoAndCheckerMaker { + public: + Conv3DOpMaker(OpProto* proto, OpAttrChecker* op_checker); +}; + +class ConvOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +class ConvOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +template +class GemmConvKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* input = context.Input("Input"); + // The filter will be reshaped in the calculations, + // so here use an assignment operation, + // that avoids modifying the variable in the Scope. + Tensor filter = *context.Input("Filter"); + Tensor* output = context.Output("Output"); + output->mutable_data(context.GetPlace()); + + int groups = context.Attr("groups"); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); + + const int batch_size = static_cast(input->dims()[0]); + + // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w} + std::vector filter_shape_vec(framework::vectorize(filter.dims())); + // output_shape_vec: {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w} + std::vector output_shape_vec(framework::vectorize(output->dims())); + + // use col_shape in the im2col calculation + // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d, + // o_h, o_w} + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = input->dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_shape_vec)); + + // use col_matrix_shape in the gemm calculation + // size: (i_c/g * k_h * k_w, o_h * o_w) or (i_c/g * k_d * k_h * k_w, o_d * + // o_h * o_w) + framework::DDim col_matrix_shape = + framework::flatten_to_2d(col_shape, data_dim + 1); + + bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); + Tensor col; + // col_matrix shares the same piece of data with col, + // but will be reshaped into a two-dimensional matrix shape + // to call the matrix multiplication interface. 
+ Tensor col_matrix; + if (is_expand) { + col.mutable_data(col_shape, context.GetPlace()); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + framework::DDim input_shape = framework::slice_ddim( + input->dims(), 1, static_cast(input->dims().size())); + + framework::DDim filter_matrix_shape = {filter.dims()[0], + filter.numel() / filter.dims()[0]}; + filter.Resize(filter_matrix_shape); + + framework::DDim output_matrix_shape = { + output->dims()[1], + output->numel() / (output->dims()[0] * output->dims()[1])}; + + // convolution operator: im2col(or vol2col) + gemm + int in_step = static_cast(input->dims()[1]) / groups; + int out_step = static_cast(output->dims()[1]) / groups; + + math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; + + auto& dev_ctx = context.template device_context(); + for (int i = 0; i < batch_size; i++) { + Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); + Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); + + for (int g = 0; g < groups; g++) { + Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + // im2col + im2col(dev_ctx, in_slice, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + // vol2col + vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col); + } + + // gemm + Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); + math::matmul(dev_ctx, filter_slice, false, col_matrix, + false, T(1.0), &out_slice, T(0.0)); + } + } + } +}; + +template +class GemmConvGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* input = context.Input("Input"); + const Tensor* output_grad = + context.Input(framework::GradVarName("Output")); + Tensor* input_grad = + context.Output(framework::GradVarName("Input")); + Tensor* filter_grad = + context.Output(framework::GradVarName("Filter")); + // The filter and filter_grad will be reshaped in the calculations, + // so here use an assignment operation, + // that avoids modifying the variable in the Scope. 
+ Tensor filter = *context.Input("Filter"); + + if (!input_grad && !filter_grad) return; + + int groups = context.Attr("groups"); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); + + const int batch_size = static_cast(input->dims()[0]); + + // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w} + std::vector filter_shape_vec(framework::vectorize(filter.dims())); + // output_shape_vec: {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w} + std::vector output_shape_vec( + framework::vectorize(output_grad->dims())); + + // use col_shape in the im2col calculation + // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d, + // o_h, o_w} + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = input->dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_shape_vec)); + + // use col_matrix_shape in the gemm calculation + // size: (i_c/g * k_h * k_w, o_h * o_w) + // or + // (i_c/g * k_d * k_h * k_w, o_d * o_h * o_w) + framework::DDim col_matrix_shape = + framework::flatten_to_2d(col_shape, data_dim + 1); + + framework::DDim input_shape = framework::slice_ddim( + input->dims(), 1, static_cast(input->dims().size())); + + framework::DDim filter_matrix_shape = {filter.dims()[0], + filter.numel() / filter.dims()[0]}; + filter.Resize(filter_matrix_shape); + + framework::DDim output_matrix_shape = { + output_grad->dims()[1], + output_grad->numel() / + (output_grad->dims()[0] * output_grad->dims()[1])}; + + // convolution backward input operator: gemm + col2im(or col2vol) + // convolution backward weight operator: im2col(or vol2col) + gemm + int in_step = static_cast(input->dims()[1]) / groups; + int out_step = static_cast(output_grad->dims()[1]) / groups; + + bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); + Tensor col; + // col_matrix shares the same piece of data with col, + // but will be reshaped into a two-dimensional matrix shape + // to call the matrix multiplication interface. + Tensor col_matrix; + if (is_expand) { + col.mutable_data(col_shape, context.GetPlace()); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); + + if (input_grad) { + input_grad->mutable_data(context.GetPlace()); + + // if is_expand is false, the operation of set_zero is unnecessary, + // because math::matmul will reset input_grad. 
+ if (is_expand) { + set_zero(dev_ctx, input_grad, static_cast(0)); + } + math::Col2VolFunctor col2vol; + math::Col2ImFunctor col2im; + + for (int i = 0; i < batch_size; i++) { + Tensor out_grad_batch = + output_grad->Slice(i, i + 1).Resize(output_matrix_shape); + Tensor in_grad_batch = input_grad->Slice(i, i + 1).Resize(input_shape); + for (int g = 0; g < groups; g++) { + // gemm + Tensor out_grad_slice = + out_grad_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); + + Tensor in_grad_slice = + in_grad_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col_matrix.ShareDataWith(in_grad_slice); + col_matrix.Resize(col_matrix_shape); + } + math::matmul(dev_ctx, filter_slice, true, + out_grad_slice, false, T(1.0), + &col_matrix, T(0.0)); + + if (is_expand && data_dim == 2U) { + col2im(dev_ctx, col, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &in_grad_slice); + } else if (is_expand && data_dim == 3U) { + col2vol(dev_ctx, col, dilations, strides, paddings, &in_grad_slice); + } + } + } + } + + if (filter_grad) { + filter_grad->mutable_data(context.GetPlace()); + Tensor filter_grad_ = *filter_grad; + filter_grad_.Resize(filter_matrix_shape); + set_zero(dev_ctx, filter_grad, static_cast(0)); + math::Im2ColFunctor im2col; + math::Vol2ColFunctor vol2col; + for (int i = 0; i < batch_size; i++) { + Tensor out_grad_batch = + output_grad->Slice(i, i + 1).Resize(output_matrix_shape); + Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); + for (int g = 0; g < groups; g++) { + // im2col + Tensor out_grad_slice = + out_grad_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + im2col(dev_ctx, in_slice, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col); + } + + // gemm + Tensor filter_grad_slice = + filter_grad_.Slice(g * out_step, (g + 1) * out_step); + math::matmul(dev_ctx, out_grad_slice, false, + col_matrix, true, T(1.0), + &filter_grad_slice, T(1.0)); + } + } + } + } +}; + +template +class DepthwiseConvKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* input = context.Input("Input"); + Tensor filter = *context.Input("Filter"); + Tensor* output = context.Output("Output"); + output->mutable_data(context.GetPlace()); + + PADDLE_ENFORCE_EQ( + output->dims()[1] % input->dims()[1], 0, + "The output channels must be a multiple of the input channels"); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); + + math::DepthwiseConvFunctor depthwiseConv; + + auto& dev_ctx = context.template device_context(); + depthwiseConv(dev_ctx, *input, filter, strides, paddings, output); + } +}; + +template +class DepthwiseConvGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* input = context.Input("Input"); + const Tensor* output_grad = + context.Input(framework::GradVarName("Output")); + Tensor* input_grad = + 
context.Output(framework::GradVarName("Input")); + Tensor* filter_grad = + context.Output(framework::GradVarName("Filter")); + Tensor filter = *context.Input("Filter"); + + if (!input_grad && !filter_grad) return; + + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); + + math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); + + math::DepthwiseConvInputGradFunctor + depthwiseConvInputGrad; + math::DepthwiseConvFilterGradFunctor + depthwiseConvFilterGrad; + + if (input_grad) { + input_grad->mutable_data(context.GetPlace()); + set_zero(dev_ctx, input_grad, static_cast(0)); + depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides, + paddings, input_grad); + } + + if (filter_grad) { + filter_grad->mutable_data(context.GetPlace()); + set_zero(dev_ctx, filter_grad, static_cast(0)); + depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, paddings, + filter_grad); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/conv_shift_op.cc b/paddle/fluid/operators/conv_shift_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a96aac63e09b47c9afe99b2e622a718839ba047c --- /dev/null +++ b/paddle/fluid/operators/conv_shift_op.cc @@ -0,0 +1,202 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/conv_shift_op.h" +#include "paddle/fluid/framework/eigen.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; +template +using EigenMatrix = framework::EigenMatrix; + +class ConvShiftOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should be not null."); + + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2."); + PADDLE_ENFORCE_EQ(y_dims.size(), 2, "Input(Y)'s rank should be 2."); + PADDLE_ENFORCE_EQ(x_dims[0], y_dims[0], + "The 1st dimension of Input(X) and Input(Y) should " + "be equal."); + PADDLE_ENFORCE_EQ(y_dims[1] % 2, 1, + "The 2nd dimension of Input(Y) should be odd."); + PADDLE_ENFORCE_LE(y_dims[1], x_dims[1], + "The 2nd dimension of Input(Y) should be less than or " + "equal to the 2nd dimension of Input(X)."); + ctx->SetOutputDim("Out", x_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class ConvShiftGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should be not null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should be not null."); + + auto x_grad_name = framework::GradVarName("X"); + if (ctx->HasOutput(x_grad_name)) { + auto x_dims = ctx->GetInputDim("X"); + ctx->SetOutputDim(x_grad_name, x_dims); + } + + auto y_grad_name = framework::GradVarName("Y"); + if (ctx->HasOutput(y_grad_name)) { + auto y_dims = ctx->GetInputDim("Y"); + ctx->SetOutputDim(y_grad_name, y_dims); + } + } +}; + +class ConvShiftOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ConvShiftOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(Tensor, default Tensor), a 2-D tensor with shape B x M, " + "where B is the batch size and M is the data dimension."); + AddInput("Y", + "(Tensor, default Tensor), a 2-D tensor with shape B x N, " + "where B is the batch size and N is the data dimension. N must " + "be odd."); + AddOutput("Out", + "(Tensor, default Tensor), a 2-D tensor with shape B x M, " + "i.e., the same shape as X."); + AddComment(R"DOC( +ConvShift Operator. + +A layer for circular convolution of two vectors, +as used in the Neural Turing Machine: https://arxiv.org/abs/1410.5401 + +The equation is: + +$$Out[i] = \sum_{j=-(N-1)/2}^{(N-1)/2} X_{i+j} * Y_{j}$$ + +where X's index is computed modulo M, and Y's index is computed modulo N. + +Both inputs X and Y can carry LoD (Level of Details) information. +However, the output only shares the LoD information with input X. 
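+For instance, with M = 5 and N = 3, the sum for the first output element wraps
+around the end of X (the X index -1 is taken modulo 5):
+
+$$Out[0] = X_{4} * Y_{-1} + X_{0} * Y_{0} + X_{1} * Y_{1}$$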
+ +)DOC"); + } +}; + +template +class ConvShiftKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *X = context.Input("X"); + auto *Y = context.Input("Y"); + auto *Out = context.Output("Out"); + Out->mutable_data(context.GetPlace()); + + auto x = EigenMatrix::From(*X); + auto y = EigenMatrix::From(*Y); + auto out = EigenMatrix::From(*Out); + out.setZero(); + + size_t batch_size = X->dims()[0]; + size_t x_width = X->dims()[1]; + size_t y_width = Y->dims()[1]; + size_t y_half_width = (y_width - 1) / 2; + + for (size_t k = 0; k < batch_size; ++k) { + for (size_t i = 0; i < x_width; ++i) { + for (size_t j = 0; j < y_width; ++j) { + int index = (i + j - y_half_width + x_width) % x_width; + out(k, i) += x(k, index) * y(k, j); + } + } + } + } +}; + +template +class ConvShiftGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *X = context.Input("X"); + auto *Y = context.Input("Y"); + auto *dOut = context.Input(framework::GradVarName("Out")); + auto *dX = context.Output(framework::GradVarName("X")); + auto *dY = context.Output(framework::GradVarName("Y")); + + auto x = EigenMatrix::From(*X); + auto y = EigenMatrix::From(*Y); + auto dout = EigenMatrix::From(*dOut); + + auto x_dims = X->dims(); + auto y_dims = Y->dims(); + size_t batch_size = x_dims[0]; + size_t x_width = x_dims[1]; + size_t y_width = y_dims[1]; + size_t y_half_width = (y_width - 1) / 2; + + // The below trades code duplication for efficiency (keeping the if + // statement outside of the loop). + if (dX) { + dX->mutable_data(context.GetPlace()); + auto dx = EigenMatrix::From(*dX); + dx.setZero(); + for (size_t k = 0; k < batch_size; ++k) { + for (size_t i = 0; i < x_width; ++i) { + for (size_t j = 0; j < y_width; ++j) { + int index = (i + j - y_half_width + x_width) % x_width; + dx(k, index) += dout(k, i) * y(k, j); + } + } + } + } + + if (dY) { + dY->mutable_data(context.GetPlace()); + auto dy = EigenMatrix::From(*dY); + dy.setZero(); + for (size_t k = 0; k < batch_size; ++k) { + for (size_t i = 0; i < x_width; ++i) { + for (size_t j = 0; j < y_width; ++j) { + int index = (i + j - y_half_width + x_width) % x_width; + dy(k, j) += x(k, index) * dout(k, i); + } + } + } + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(conv_shift, ops::ConvShiftOp, ops::ConvShiftOpMaker, + conv_shift_grad, ops::ConvShiftGradOp); +REGISTER_OP_CPU_KERNEL(conv_shift, + ops::ConvShiftKernel); +REGISTER_OP_CPU_KERNEL( + conv_shift_grad, + ops::ConvShiftGradKernel); diff --git a/paddle/fluid/operators/conv_shift_op.cu b/paddle/fluid/operators/conv_shift_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..9818707ce3b98afe25050336d85e3b05919620f3 --- /dev/null +++ b/paddle/fluid/operators/conv_shift_op.cu @@ -0,0 +1,197 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/conv_shift_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +namespace { + +inline int DivUp(int x, int y) { return (x + y - 1) / y; } + +// Some notes on the design: +// +// Each thread is responsible for computing a single output out[k, i]. +// Thread blocks are based on tiles of x with height 1 in the batch dimension. +// +// This design is based on the typical use case where the filter +// y is fairly small. For large y, it would probably be more efficient +// to also tile across y. +template +__global__ void ConvShiftForward(const T *x, const T *y, int x_width, + int y_width, int y_half_width, int batch_size, + T *out) { + extern __shared__ T mem[]; + + int tx = threadIdx.x; + int i = blockIdx.x * blockDim.x + tx; // global x index + int k = blockIdx.y; // batch index + + // Check if we are in a boundary block with fewer x's to process than + // blockDim.x. + int num_x = + (blockIdx.x == gridDim.x - 1) ? (x_width % blockDim.x) : blockDim.x; + + T *sx = mem; + T *sx_pad = &mem[num_x]; + T *sy = &mem[blockDim.x + y_width]; + + // Collaboratively load y[k, :] and length-y padding of x into shared memory. + int pad_start = blockIdx.x * blockDim.x + num_x + x_width - y_half_width; + for (int j = tx; j < y_width; j += blockDim.x) { + sy[j] = y[k * y_width + j]; + sx_pad[j] = x[k * x_width + (pad_start + j) % x_width]; + } + + // Load a cyclically shifted slice of x into shared memory. + if (tx < num_x) { + int load_i = (i - y_half_width + x_width) % x_width; + sx[tx] = x[k * x_width + load_i]; + } + __syncthreads(); + + if (tx < num_x) { + // Compute dot product of sx[tx:tx + y_width] and sy. + T sum = 0; + for (int j = 0; j < y_width; ++j) { + sum += sx[tx + j] * sy[j]; + } + + // Save to out[k, i]. + out[k * x_width + i] = sum; + } +} + +// Compute x gradient - initial naive implementation with atomic add. +template +__global__ void ConvShiftGradX(const T *dout, const T *y, int x_width, + int y_width, int y_half_width, int batch_size, + T *dx) { + int i = blockIdx.x * blockDim.x + threadIdx.x; // x index + int j = blockIdx.y; // y index + int k = blockIdx.z; // batch index + + if (i < x_width) { + int index = (i + j - y_half_width + x_width) % x_width; + atomicAdd(&dx[k * x_width + index], + dout[k * x_width + i] * y[k * y_width + j]); + } +} + +// Compute y gradient - initial naive implementation with atomic add. 
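+// Every thread with the same (j, k) but a different x index i adds its
+// contribution x[k, index] * dout[k, i] into the same element dy[k, j], so the
+// update must use atomicAdd.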
+template +__global__ void ConvShiftDy(const T *x, const T *dout, int x_width, int y_width, + int y_half_width, int batch_size, T *dy) { + int i = blockIdx.x * blockDim.x + threadIdx.x; // x index + int j = blockIdx.y; // y index + int k = blockIdx.z; // batch index + + if (i < x_width) { + int index = (i + j - y_half_width + x_width) % x_width; + atomicAdd(&dy[k * y_width + j], + x[k * x_width + index] * dout[k * x_width + i]); + } +} +} // namespace + +template +class ConvShiftKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + const Tensor *X = context.Input("X"); + const Tensor *Y = context.Input("Y"); + Tensor *Out = context.Output("Out"); + const T *x_data = X->data(); + const T *y_data = Y->data(); + T *out_data = Out->mutable_data(context.GetPlace()); + + int batch_size = X->dims()[0]; + int x_width = X->dims()[1]; + int y_width = Y->dims()[1]; + int y_half_width = (y_width - 1) / 2; + + const int x_per_block = 256; + int num_x_blocks = DivUp(x_width, x_per_block); + int mem_per_block = (x_per_block + 2 * y_width) * sizeof(T); + + dim3 grid_dim(num_x_blocks, batch_size); + + auto stream = + context.template device_context().stream(); + + ConvShiftForward<<>>( + x_data, y_data, x_width, y_width, y_half_width, batch_size, out_data); + } +}; + +template +class ConvShiftGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + const Tensor *X = context.Input("X"); + const Tensor *Y = context.Input("Y"); + const Tensor *dOut = context.Input(framework::GradVarName("Out")); + const T *x_data = X->data(); + const T *y_data = Y->data(); + const T *dout_data = dOut->data(); + + Tensor *dX = context.Output(framework::GradVarName("X")); + Tensor *dY = context.Output(framework::GradVarName("Y")); + + int batch_size = X->dims()[0]; + int x_width = X->dims()[1]; + int y_width = Y->dims()[1]; + int y_half_width = (y_width - 1) / 2; + + auto &device_ctx = + context.template device_context(); + math::SetConstant zero; + + const int x_per_block = 256; + int num_x_blocks = DivUp(x_width, x_per_block); + dim3 grid_dim(num_x_blocks, y_width, batch_size); + + if (dX) { + T *dx_data = dX->mutable_data(context.GetPlace()); + zero(device_ctx, dX, static_cast(0.0)); + ConvShiftGradX<<>>( + dout_data, y_data, x_width, y_width, y_half_width, batch_size, + dx_data); + } + if (dY) { + T *dy_data = dY->mutable_data(context.GetPlace()); + zero(device_ctx, dY, static_cast(0.0)); + ConvShiftDy<<>>( + x_data, dout_data, x_width, y_width, y_half_width, batch_size, + dy_data); + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + conv_shift, + ops::ConvShiftKernel); +REGISTER_OP_CUDA_KERNEL( + conv_shift_grad, + ops::ConvShiftGradKernel); diff --git a/paddle/fluid/operators/conv_shift_op.h b/paddle/fluid/operators/conv_shift_op.h new file mode 100644 index 0000000000000000000000000000000000000000..987a690895e2a7428f058eb2d8366f9c7572912b --- /dev/null +++ b/paddle/fluid/operators/conv_shift_op.h @@ -0,0 +1,33 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class ConvShiftKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override; +}; + +template +class ConvShiftGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override; +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..0aed4ebeffa7312c218bb892fbcdf9cd9cdc53ca --- /dev/null +++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc @@ -0,0 +1,251 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/operators/conv_transpose_op.h" +#include "paddle/fluid/platform/assert.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; +using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; +using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor; +using DataLayout = platform::DataLayout; + +static constexpr size_t kConvCUDNNWorkspaceLimitBytes = 1024 * 1024 * 1024; + +template +class CUDNNConvTransposeOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + auto* input = ctx.Input("Input"); + auto* filter = ctx.Input("Filter"); + auto* output = ctx.Output("Output"); + + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + // cudnn v5 does not support dilations + std::vector dilations = ctx.Attr>("dilations"); + int user_workspace_size = ctx.Attr("workspace_size_MB"); + + const T* input_data = input->data(); + const T* filter_data = filter->data(); + T* output_data = output->mutable_data(ctx.GetPlace()); + // ------------------- cudnn descriptors --------------------- + ScopedTensorDescriptor input_desc; + ScopedTensorDescriptor output_desc; + ScopedFilterDescriptor filter_desc; + ScopedConvolutionDescriptor conv_desc; + DataLayout layout; + + if (strides.size() == 2U) { + layout = DataLayout::kNCHW; + } else { + layout = DataLayout::kNCDHW; + } + + // (N, M, H, W) or (N, M, D, H, W) + cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + layout, framework::vectorize2int(input->dims())); + // (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w) + cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( + layout, framework::vectorize2int(output->dims())); + // (M, C, K_h, K_w) or (M, C, K_d, K_h, K_w) + cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor( + layout, framework::vectorize2int(filter->dims())); + cudnnConvolutionDescriptor_t cudnn_conv_desc = + conv_desc.descriptor(paddings, strides, dilations); + + // ------------------- cudnn conv workspace --------------------- + void* cudnn_workspace = nullptr; + size_t workspace_size_in_bytes; // final workspace to allocate. + size_t workspace_size_limit = kConvCUDNNWorkspaceLimitBytes; + if (user_workspace_size > 0) { + workspace_size_limit = user_workspace_size * 1024 * 1024; + } + // ------------------- cudnn conv algorithm --------------------- + cudnnConvolutionBwdDataAlgo_t algo; + auto& dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); + // Get the algorithm + PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( + handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc, + // dxDesc: Handle to the previously initialized output tensor + // descriptor. 
+ cudnn_output_desc, CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &algo)); + + // get workspace size able to allocate + PADDLE_ENFORCE( + platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( + handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc, + cudnn_output_desc, algo, &workspace_size_in_bytes)); + + // Allocate on GPU memory + platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); + cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); + + // ------------------- cudnn conv transpose forward --------------------- + T alpha = 1.0f, beta = 0.0f; + PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( + handle, &alpha, cudnn_filter_desc, filter_data, cudnn_input_desc, + input_data, cudnn_conv_desc, algo, cudnn_workspace, + workspace_size_in_bytes, &beta, cudnn_output_desc, output_data)); + + // Release the cudnn workspace + paddle::memory::Free(gpu, cudnn_workspace); + } +}; + +template +class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + auto input = ctx.Input("Input"); + auto filter = ctx.Input("Filter"); + auto output_grad = ctx.Input(framework::GradVarName("Output")); + auto input_grad = ctx.Output(framework::GradVarName("Input")); + auto filter_grad = ctx.Output(framework::GradVarName("Filter")); + const T* input_data = input->data(); + const T* output_grad_data = output_grad->data(); + const T* filter_data = filter->data(); + + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + // cudnn v5 does not support dilations + std::vector dilations = ctx.Attr>("dilations"); + int user_workspace_size = ctx.Attr("workspace_size_MB"); + + // ------------------- cudnn descriptors --------------------- + ScopedTensorDescriptor input_desc; + ScopedTensorDescriptor output_desc; + ScopedFilterDescriptor filter_desc; + ScopedConvolutionDescriptor conv_desc; + DataLayout layout = DataLayout::kNCHW; + + // Input: (N, M, H, W) or (N, M, D, H, W) + cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + layout, framework::vectorize2int(input->dims())); + // Output: (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w) + cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( + layout, framework::vectorize2int(output_grad->dims())); + // Filter (M, C, K_h, K_w) or (M, C, K_d K_h, K_w) + cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor( + layout, framework::vectorize2int(filter->dims())); + + cudnnConvolutionDescriptor_t cudnn_conv_desc = + conv_desc.descriptor(paddings, strides, dilations); + + // ------------------- cudnn backward algorithm --------------------- + cudnnConvolutionFwdAlgo_t data_algo; + cudnnConvolutionBwdFilterAlgo_t filter_algo; + size_t bwd_filter_ws_size, fwd_ws_size; + size_t workspace_size_in_bytes = 0; + size_t workspace_size_limit = kConvCUDNNWorkspaceLimitBytes; + if (user_workspace_size > 0) { + workspace_size_limit = user_workspace_size * 1024 * 1024; + } + + auto& dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); + if (input_grad) { + // choose backward algorithm for data + PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( + handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc, + cudnn_input_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &data_algo)); + 
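+      // Because the transposed convolution's forward pass is computed with
+      // cuDNN's backward-data routine, the gradients swap roles here: the
+      // gradient w.r.t. Input is obtained with cudnnConvolutionForward
+      // (feeding Output@GRAD as the "input" image), and the gradient w.r.t.
+      // Filter with cudnnConvolutionBackwardFilter further below.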
PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( + handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc, + cudnn_input_desc, data_algo, &fwd_ws_size)); + workspace_size_in_bytes = std::max(workspace_size_in_bytes, fwd_ws_size); + } + + if (filter_grad) { + // choose backward algorithm for filter + PADDLE_ENFORCE( + platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( + handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc, + cudnn_filter_desc, + CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &filter_algo)); + + // get workspace for backwards filter algorithm + PADDLE_ENFORCE( + platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( + handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc, + cudnn_filter_desc, filter_algo, &bwd_filter_ws_size)); + workspace_size_in_bytes = + std::max(workspace_size_in_bytes, bwd_filter_ws_size); + } + + // ------------------- cudnn conv workspace --------------------- + // Already on GPU + void* cudnn_workspace = nullptr; + platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); + cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); + // ------------------- cudnn conv backward data --------------------- + // FIXME(typhoonzero): template type T may not be the same as cudnn call. + T alpha = 1.0f, beta = 0.0f; + if (input_grad) { + T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); + // Because beta is zero, it is unnecessary to reset input_grad. + PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward( + handle, &alpha, cudnn_output_desc, output_grad_data, + cudnn_filter_desc, filter_data, cudnn_conv_desc, data_algo, + cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc, + input_grad_data)); + } + + // ------------------- cudnn conv backward filter --------------------- + if (filter_grad) { + T* filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); + // Because beta is zero, it is unnecessary to reset filter_grad. + // Gradient with respect to the filter + PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( + handle, &alpha, cudnn_output_desc, output_grad_data, cudnn_input_desc, + input_data, cudnn_conv_desc, filter_algo, cudnn_workspace, + workspace_size_in_bytes, &beta, cudnn_filter_desc, filter_grad_data)); + } + // Release the cudnn workspace + paddle::memory::Free(gpu, cudnn_workspace); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_KERNEL(conv2d_transpose, CUDNN, ::paddle::platform::CUDAPlace, + ops::CUDNNConvTransposeOpKernel, + ops::CUDNNConvTransposeOpKernel); +REGISTER_OP_KERNEL(conv2d_transpose_grad, CUDNN, ::paddle::platform::CUDAPlace, + ops::CUDNNConvTransposeGradOpKernel, + ops::CUDNNConvTransposeGradOpKernel); + +REGISTER_OP_KERNEL(conv3d_transpose, CUDNN, ::paddle::platform::CUDAPlace, + ops::CUDNNConvTransposeOpKernel, + ops::CUDNNConvTransposeOpKernel); +REGISTER_OP_KERNEL(conv3d_transpose_grad, CUDNN, ::paddle::platform::CUDAPlace, + ops::CUDNNConvTransposeGradOpKernel, + ops::CUDNNConvTransposeGradOpKernel); diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..974cffad92871c1a855c86a7a2e56f8e65819428 --- /dev/null +++ b/paddle/fluid/operators/conv_transpose_op.cc @@ -0,0 +1,323 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/conv_transpose_op.h" + +namespace paddle { +namespace operators { + +void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of ConvTransposeOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Filter"), + "Input(Filter) of ConvTransposeOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Output"), + "Output(Output) of ConvTransposeOp should not be null."); + + auto in_dims = ctx->GetInputDim("Input"); + auto filter_dims = ctx->GetInputDim("Filter"); + std::vector strides = ctx->Attrs().Get>("strides"); + std::vector paddings = ctx->Attrs().Get>("paddings"); + std::vector dilations = ctx->Attrs().Get>("dilations"); + + PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5, + "ConvTransposeOp intput should be 4-D or 5-D tensor."); + PADDLE_ENFORCE_EQ(in_dims.size(), filter_dims.size(), + "ConvTransposeOp input dimension and filter dimension " + "should be the same."); + PADDLE_ENFORCE(in_dims.size() - strides.size() == 2U, + "ConvTransposeOp input dimension and strides dimension should " + "be consistent."); + PADDLE_ENFORCE_EQ(paddings.size(), strides.size(), + "ConvTransposeOp paddings dimension and strides " + "dimension should be the same."); + PADDLE_ENFORCE_EQ(paddings.size(), dilations.size(), + "ConvTransposeOp paddings dimension and dilations " + "dimension should be the same."); + PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[0], + "In ConvTransposeOp, The input channel should be the same " + "as the number of filters."); + + std::vector output_shape({in_dims[0], filter_dims[1]}); + for (size_t i = 0; i < strides.size(); ++i) { + auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1; + output_shape.push_back((in_dims[i + 2] - 1) * strides[i] - 2 * paddings[i] + + filter_extent); + } + ctx->SetOutputDim("Output", framework::make_ddim(output_shape)); +} + +framework::OpKernelType ConvTransposeOp::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + bool use_cudnn = ctx.Attr("use_cudnn"); + use_cudnn &= platform::is_gpu_place(ctx.GetPlace()); +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(ctx.GetPlace())) { + auto& dev_ctx = ctx.template device_context(); + use_cudnn &= dev_ctx.cudnn_handle() != nullptr; + } +#endif + framework::LibraryType library_; + if (use_cudnn) { + library_ = framework::LibraryType::kCUDNN; + } else { + library_ = framework::LibraryType::kPlain; + } + + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), + layout_, library_); +} + +Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(OpProto* proto, + OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "Input", + "(Tensor) The input tensor of convolution transpose operator. " + "The format of input tensor is NCHW. 
Where N is batch size, C is the " + "number of input channels, H is the height of the feature, and " + "W is the width of the feature."); + AddInput( + "Filter", + "(Tensor) The filter tensor of convolution transpose operator. " + "The format of the filter tensor is MCHW, where M is the number of " + "input feature channels, C is the number of " + "output feature channels," + "H is the height of the filter, and W is the width of the filter. " + "We enforce groups number == 1 in the convolution transpose scenario."); + AddOutput("Output", + "(Tensor) The output tensor of convolution transpose operator. " + "The format of output tensor is also NCHW."); + + AddAttr>("dilations", + "(vector default:{1, 1}), the " + "dilations(h_dilation, w_dilation) of convolution " + "transpose operator.") + .SetDefault({1, 1}); + AddAttr>( + "strides", + "(vector default:{1, 1}), the strides(h_stride, w_stride) of " + "convolution transpose operator.") + .SetDefault({1, 1}); + AddAttr>( + "paddings", + "(vector default:{0, 0}), the paddings(h_pad, w_pad) of convolution " + "transpose operator.") + .SetDefault({0, 0}); + AddAttr( + "use_cudnn", + "(bool, default false) Only used in cudnn kernel, need install cudnn") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); + // TODO(dzhwinter): need to registered layout transform function + AddAttr("workspace_size_MB", + "Used in cudnn kernel only. workspace size for cudnn, in MB, " + "workspace is a section of GPU memory which will be " + "allocated/freed each time the operator runs, larger " + "workspace size can increase performance but also requires " + "better hardward. This size should be carefully setted.") + .SetDefault(4096); + AddComment(R"DOC( +Convolution2D Transpose Operator. + +The convolution transpose operation calculates the output based on the input, filter +and dilations, strides, paddings, groups parameters. The size of each dimension of the +parameters is checked in the infer-shape. +Input(Input) and output(Output) are in NCHW format. Where N is batchsize, C is the +number of channels, H is the height of the feature, and W is the width of the feature. +Filter(Input) is in MCHW format. Where M is the number of input feature channels, +C is the number of output feature channels, H is the height of the filter, +and W is the width of the filter. +Parameters(strides, paddings) are two elements. These two elements represent height +and width, respectively. +The input(X) size and output(Out) size may be different. + +Example: + Input: + Input shape: $(N, C_{in}, H_{in}, W_{in})$ + Filter shape: $(C_{in}, C_{out}, H_f, W_f)$ + Output: + Output shape: $(N, C_{out}, H_{out}, W_{out})$ + Where + $$ + H_{out} = (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\ + W_{out} = (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1 + $$ +)DOC"); +} + +Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(OpProto* proto, + OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Input", + "(Tensor) The input tensor of convolution transpose operator." + "The format of input tensor is NCDHW. 
Where N is batch size, C is " + "the number of channels, D is the depth of the feature, H is the " + "height of the feature, and " + "W is the width of the feature."); + AddInput("Filter", + "(Tensor) The filter tensor of convolution transpose operator." + "The format of the filter tensor is MCDHW, where M is the number of " + "input feature channels, C is the number of " + "output feature channels, D " + "is the depth of the filter, H is the height of the filter, and " + "W is the width of the filter." + "We enforce groups number == 1 and padding == 0 in " + "the convolution3d transpose scenario."); + AddOutput("Output", + "(Tensor) The output tensor of convolution transpose operator." + "The format of output tensor is also NCDHW." + "Where N is batch size, C is " + "the number of channels, D is the depth of the feature, H is the " + "height of the feature, and W is the width of the feature."); + + AddAttr>( + "dilations", + "(vector default:{1, 1, 1}), the " + "dilations(d_dilation,h_dilation, w_dilation) of convolution " + "transpose operator.") + .SetDefault({1, 1, 1}); + AddAttr>("strides", + "(vector default:{1, 1, 1}), the " + "strides{d_stride, h_stride, w_stride} of " + "convolution transpose operator.") + .SetDefault({1, 1, 1}); + AddAttr>("paddings", + "(vector default:{0, 0, 0}), paddings(d_pad, " + "h_pad, w_pad) of convolution transpose operator.") + .SetDefault({0, 0, 0}); + AddAttr( + "use_cudnn", + "(bool, default false) Only used in cudnn kernel, need install cudnn") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); + // TODO(dzhwinter): need to registered layout transform function + AddAttr("workspace_size_MB", + "Used in cudnn kernel only. workspace size for cudnn, in MB, " + "workspace is a section of GPU memory which will be " + "allocated/freed each time the operator runs, larger " + "workspace size can increase performance but also requires " + "better hardward. This size should be carefully setted.") + .SetDefault(4096); + AddComment(R"DOC( +Convolution3D Transpose Operator. + +The convolution transpose operation calculates the output based on the input, filter +and dilations, strides, paddings, groups parameters. The size of each dimension of the +parameters is checked in the infer-shape. +Input(Input) and output(Output) are in NCDHW format. Where N is batch size, C is the +number of channels, D is the depth of the feature, H is the height of the feature, +and W is the width of the feature. +Filter(Input) is in MCDHW format. Where M is the number of input feature channels, +C is the number of output feature channels, D is the depth of the filter,H is the +height of the filter, and W is the width of the filter. +Parameters(strides, paddings) are three elements. These three elements represent +depth, height and width, respectively. +The input(X) size and output(Out) size may be different. 
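+
+As a concrete instance of the formulas in the Example below (hypothetical
+sizes): with an input feature of size 4 x 4 x 4, a 3 x 3 x 3 filter,
+strides {2, 2, 2}, paddings {1, 1, 1} and dilations {1, 1, 1}, each output
+dimension is (4 - 1) * 2 - 2 * 1 + 1 * (3 - 1) + 1 = 7, i.e. the output
+feature is 7 x 7 x 7.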
+ +Example: + Input: + Input shape: $(N, C_{in}, D_{in}, H_{in}, W_{in})$ + Filter shape: $(C_{in}, C_{out}, D_f, H_f, W_f)$ + Output: + Output shape: $(N, C_{out}, D_{out}, H_{out}, W_{out})$ + Where + $$ + D_{out} = (D_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (D_f - 1) + 1 \\ + H_{out} = (H_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (H_f - 1) + 1 \\ + W_{out} = (W_{in} - 1) * strides[2] - 2 * paddings[2] + dilations[2] * (W_f - 1) + 1 + $$ +)DOC"); +} + +void ConvTransposeOpGrad::InferShape(framework::InferShapeContext* ctx) const { + auto in_dims = ctx->GetInputDim("Input"); + auto filter_dims = ctx->GetInputDim("Filter"); + if (ctx->HasOutput(framework::GradVarName("Input"))) { + ctx->SetOutputDim(framework::GradVarName("Input"), in_dims); + } + if (ctx->HasOutput(framework::GradVarName("Filter"))) { + ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims); + } +} + +framework::OpKernelType ConvTransposeOpGrad::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + bool use_cudnn = ctx.Attr("use_cudnn"); + use_cudnn &= platform::is_gpu_place(ctx.GetPlace()); +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(ctx.GetPlace())) { + auto& dev_ctx = ctx.template device_context(); + use_cudnn &= dev_ctx.cudnn_handle() != nullptr; + } +#endif + framework::LibraryType library_; + if (use_cudnn) { + library_ = framework::LibraryType::kCUDNN; + } else { + library_ = framework::LibraryType::kPlain; + } + + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), + layout_, library_); +} + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP(conv2d_transpose, ops::ConvTransposeOp, ops::Conv2DTransposeOpMaker, + conv2d_transpose_grad, ops::ConvTransposeOpGrad); + +REGISTER_OP_CPU_KERNEL( + conv2d_transpose, + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); +REGISTER_OP_CPU_KERNEL( + conv2d_transpose_grad, + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); + +REGISTER_OP(conv3d_transpose, ops::ConvTransposeOp, ops::Conv3DTransposeOpMaker, + conv3d_transpose_grad, ops::ConvTransposeOpGrad); + +REGISTER_OP_CPU_KERNEL( + conv3d_transpose, + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); +REGISTER_OP_CPU_KERNEL( + conv3d_transpose_grad, + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); diff --git a/paddle/fluid/operators/conv_transpose_op.cu.cc b/paddle/fluid/operators/conv_transpose_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..ed90c6ec6265cc914a172b6c7217a204981e7fd1 --- /dev/null +++ b/paddle/fluid/operators/conv_transpose_op.cu.cc @@ -0,0 +1,39 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/conv_transpose_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + conv2d_transpose, + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); +REGISTER_OP_CUDA_KERNEL( + conv2d_transpose_grad, + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); + +REGISTER_OP_CUDA_KERNEL( + conv3d_transpose, + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); +REGISTER_OP_CUDA_KERNEL( + conv3d_transpose_grad, + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); diff --git a/paddle/fluid/operators/conv_transpose_op.h b/paddle/fluid/operators/conv_transpose_op.h new file mode 100644 index 0000000000000000000000000000000000000000..f512575468626edfb3e36c007e26b05faff0a06d --- /dev/null +++ b/paddle/fluid/operators/conv_transpose_op.h @@ -0,0 +1,291 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/im2col.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/vol2col.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using DDim = framework::DDim; + +// Define Op classes in .h file so that other conv transpose +// operator implementations can reuse the code. +class Conv2DTransposeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + Conv2DTransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker); +}; + +class Conv3DTransposeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + Conv3DTransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker); +}; + +class ConvTransposeOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +class ConvTransposeOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +template +class GemmConvTransposeKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* input = context.Input("Input"); + // The filter will be reshaped, so it should not be constant pointer + Tensor filter = *context.Input("Filter"); + Tensor* output = context.Output("Output"); + + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); + // groups will alway be disabled in conv2dtranspose. 
+ + const int batch_size = static_cast(input->dims()[0]); + + // input_shape_vec: {n, c, h, w} or {n, c, d, h, w} + std::vector input_shape_vec = framework::vectorize(input->dims()); + // filter_shape_vec: {k_o, k_c, k_h, k_w} or {k_o, k_c, k_d, k_h, k_w} + std::vector filter_shape_vec = framework::vectorize(filter.dims()); + + // use col_shape in the im2col and col2im (or vol2col and col2vol) + // calculation + // col_shape_vec: {c, k_h, k_w, h, w} or {c, k_d, k_h, k_w, d, h, w} + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = output->dims()[1]; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = input_shape_vec[j + 2]; + } + DDim col_shape(framework::make_ddim(col_shape_vec)); + + // use col_matrix_shape in the gemm calculation + // size: (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w) + DDim col_matrix_shape = framework::flatten_to_2d(col_shape, data_dim + 1); + + Tensor col; + col.mutable_data(col_shape, context.GetPlace()); + // col_matrix shares the same piece of data with col, + // but will be reshaped into a two-dimensional matrix shape + // to call the matrix multiplication interface. + Tensor col_matrix; + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + + // output size: (c, o_h, o_w) or (c, o_d, o_h, o_w) + DDim output_shape = + framework::slice_ddim(output->dims(), 1, output->dims().size()); + + // input matrix size: (m, h * w) or (m, d * h * w) + DDim input_matrix_shape = {input->dims()[1], col_matrix_shape[1]}; + + // filter size: (m, c * k_h * k_w) or (m, c * k_d * k_h * k_w) + DDim filter_matrix_shape = {input->dims()[1], col_matrix_shape[0]}; + filter.Resize(filter_matrix_shape); + + output->mutable_data(context.GetPlace()); + math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); + set_zero(dev_ctx, output, static_cast(0)); + + math::Col2ImFunctor col2im; + math::Col2VolFunctor col2vol; + + // convolution transpose: gemm + col2im or col2vol (similar to conv-backward + // on input) + for (int i = 0; i < batch_size; i++) { + // batch with size (m, h * w) or (m, d * h * w) + Tensor input_batch = input->Slice(i, i + 1).Resize(input_matrix_shape); + + // output size: (c, o_h, o_w) or (c, o_d, o_h, o_w) + Tensor output_batch = output->Slice(i, i + 1).Resize(output_shape); + + // col_matrix = filter * input_batch + // of shape (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w) + math::matmul(dev_ctx, filter, true, input_batch, false, + static_cast(1.0), &col_matrix, + static_cast(0.0)); + + if (data_dim == 2U) { + // col2im: col_matrix -> dy + // from (c * k_h * k_w, h * w) to (c, o_h, o_w) + col2im(dev_ctx, col, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &output_batch); + } else if (data_dim == 3U) { + // col2vol: col_matrix -> dy + // from (c * k_d * k_h * k_w, d * h * w) to (c, o_d, o_h, o_w) + col2vol(dev_ctx, col, dilations, strides, paddings, &output_batch); + } + } + } +}; + +template +class GemmConvTransposeGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* input = context.Input("Input"); + const Tensor* output_grad = + context.Input(framework::GradVarName("Output")); + // For filter, we do not use const pointer b/c we will do reshape, + // but we should avoid modifying its value. 
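+    // (Taking a by-value Tensor copy means the Resize() below only changes
+    // this copy's dims; the underlying data buffer is shared, not copied.)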
+ Tensor filter = *context.Input("Filter"); + Tensor* input_grad = + context.Output(framework::GradVarName("Input")); + Tensor* filter_grad = + context.Output(framework::GradVarName("Filter")); + + if ((!input_grad) && (!filter_grad)) return; + + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); + + const int batch_size = static_cast(input->dims()[0]); + + // input_shape_vec: {n, c, h, w} or {n, c, d, h, w} + std::vector input_shape_vec = framework::vectorize(input->dims()); + // filter_shape_vec: {k_o, k_c, k_h, k_w} or {k_o, k_c, k_d, k_h, k_w} + std::vector filter_shape_vec = framework::vectorize(filter.dims()); + + // use col_shape in the im2col and col2im (or vol2col and col2vol) + // calculation + // col_shape_vec: {c, k_h, k_w, h, w} or {c, k_d, k_h, k_w, d, h, w} + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = output_grad->dims()[1]; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = input_shape_vec[j + 2]; + } + DDim col_shape(framework::make_ddim(col_shape_vec)); + + // use col_matrix_shape in the gemm calculation + // size: (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w) + DDim col_matrix_shape = framework::flatten_to_2d(col_shape, data_dim + 1); + + // output size: (c, o_h, o_w) or (c, o_d, o_h, o_w) + DDim output_shape = framework::slice_ddim(output_grad->dims(), 1, + output_grad->dims().size()); + + // input matrix size: (m, h * w) or (m, d * h * w) + DDim input_matrix_shape = {input->dims()[1], col_matrix_shape[1]}; + + // filter size: (m, c * k_h * k_w) or (m, c * k_d * k_h * k_w) + DDim filter_matrix_shape = {input->dims()[1], col_matrix_shape[0]}; + filter.Resize(filter_matrix_shape); + + // convolution transpose grad on input: + // im2col + gemm (similar to conv-forward) + // input need to compute gradient + auto& dev_ctx = context.template device_context(); + if (input_grad || filter_grad) { + Tensor col; + col.mutable_data(col_shape, context.GetPlace()); + // col_matrix shares the same piece of data with col, + // but will be reshaped into a two-dimensional matrix shape + // to call the matrix multiplication interface. 
+ Tensor col_matrix; + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + + Tensor filter_grad_; + math::SetConstant set_zero; + + math::Im2ColFunctor im2col; + math::Vol2ColFunctor vol2col; + + if (input_grad) { + input_grad->mutable_data(context.GetPlace()); + } + if (filter_grad) { // filter size (m, c, k_h, k_w) + filter_grad->mutable_data(context.GetPlace()); + set_zero(dev_ctx, filter_grad, static_cast(0)); + filter_grad_ = *filter_grad; + filter_grad_.Resize(filter_matrix_shape); + } + + for (int i = 0; i < batch_size; i++) { + // batch with size (c, o_h * o_w) + Tensor output_grad_batch = + output_grad->Slice(i, i + 1).Resize(output_shape); + + if (data_dim == 2U) { + // im2col: dy -> col matrix + // from (c, o_h, o_w) to (c * k_h * k_w, h * w) + im2col(dev_ctx, output_grad_batch, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + // vol2col: dy -> col_matrix + // from (c, o_d, o_h, o_w) to (c * k_d * k_h * k_w, d * h * w) + vol2col(dev_ctx, output_grad_batch, dilations, strides, paddings, + &col); + } + + if (input_grad) { + // batch with size (m, h, w) + Tensor input_grad_batch = + input_grad->Slice(i, i + 1).Resize(input_matrix_shape); + // gemm: dx = filter * dy + // (m, c * k_h * k_w) * (c * k_h * k_w, h * w) -> (m, h * w) + // or + // (m, c * k_d * k_h * k_w) * (c * k_d * k_h * k_w, d * h * w) -> (m, + // d, h, w) + math::matmul( + dev_ctx, filter, false, col_matrix, false, static_cast(1.0), + &input_grad_batch, static_cast(0.0)); + } + if (filter_grad) { + // input batch + Tensor in_batch = input->Slice(i, i + 1).Resize(input_matrix_shape); + // gemm: d_filter = x * dy^T + // (m, c * h * w) * (k_h * k_w, c * h * w) -> (m, k_h * k_w) + // or + // (m, d * h * w) * (d * h * w, c * k_d * k_h * k_w) -> (m, c * k_d * + // k_h * k_w) + math::matmul(dev_ctx, in_batch, false, col_matrix, + true, static_cast(1.0), + &filter_grad_, static_cast(1.0)); + } + } + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/cos_sim_op.cc b/paddle/fluid/operators/cos_sim_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..57c5a6025a03fbafadb56a3dbec9c4cfab5e979a --- /dev/null +++ b/paddle/fluid/operators/cos_sim_op.cc @@ -0,0 +1,162 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/cos_sim_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class CosSimOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + // notnull check + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of CosSimOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), + "Input(Y) of CosSimOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of CosSimOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("XNorm"), + "Output(XNorm) of CosSimOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("YNorm"), + "Output(YNorm) of CosSimOp should not be null."); + + // shape check + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + + PADDLE_ENFORCE_EQ(x_dims.size(), y_dims.size(), + "Ranks of Input(X) and Input(Y) must be equal."); + PADDLE_ENFORCE_GE(x_dims.size(), 2, + "Rank of Input(X) must not be less than 2."); + PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 1, x_dims.size()), + framework::slice_ddim(y_dims, 1, y_dims.size()), + "All dimensions except the 1st of Input(X) and Input(Y) " + "must be equal."); + PADDLE_ENFORCE(x_dims[0] == y_dims[0] || y_dims[0] == 1, + "The 1st dimension of Input(Y) must be equal to Input(X) or" + " just 1 (which will be broadcasted to match Input(X))."); + + // resize tensor + ctx->SetOutputDim("Out", {x_dims[0], 1}); + ctx->SetOutputDim("XNorm", {x_dims[0], 1}); + ctx->SetOutputDim("YNorm", {y_dims[0], 1}); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class CosSimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + CosSimOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The 1st input of cos_sim op."); + AddInput("Y", "The 2nd input of cos_sim op."); + AddOutput("Out", "The output of cos_sim op."); + AddOutput("XNorm", + "Norm of the first input, reduced along the 1st " + "dimension.") + .AsIntermediate(); + AddOutput("YNorm", + "Norm of the second input, reduced along the 1st " + "dimension.") + .AsIntermediate(); + + AddComment(R"DOC( +Cosine Similarity Operator. + +$Out = X^T * Y / (\sqrt{X^T * X} * \sqrt{Y^T * Y})$ + +The input X and Y must have the same shape, except that the 1st dimension +of input Y could be just 1 (different from input X), which will be +broadcasted to match the shape of input X before computing their cosine +similarity. + +Both the input X and Y can carry the LoD (Level of Details) information, +or not. But the output only shares the LoD information with input X. 
+ +)DOC"); + } +}; + +class CosSimOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + // notnull check + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) must not be null."); + PADDLE_ENFORCE(ctx->HasInput("XNorm"), "Input(XNorm) must not be null."); + PADDLE_ENFORCE(ctx->HasInput("YNorm"), "Input(YNorm) must not be null."); + PADDLE_ENFORCE(ctx->HasInput("Out"), "Input(Out) must not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) must not be null."); + + // shape check + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + auto xnorm_dims = ctx->GetInputDim("XNorm"); + auto ynorm_dims = ctx->GetInputDim("YNorm"); + auto out_dims = ctx->GetInputDim("Out"); + auto out_grad_dims = ctx->GetInputDim(framework::GradVarName("Out")); + + PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), + "Ranks of Input(X) and Input(Y) must be equal."); + PADDLE_ENFORCE_GE(x_dims.size(), 2, + "Rank of Input(X) must not be less than 2."); + PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 1, x_dims.size()), + framework::slice_ddim(y_dims, 1, y_dims.size()), + "All dimensions except the 1st of Input(X) and Input(Y) " + "must be equal."); + PADDLE_ENFORCE(x_dims[0] == y_dims[0] || y_dims[0] == 1, + "The 1st dimension of Input(Y) must be equal to Input(X) or" + " just 1 (which will be broadcasted to match Input(X))."); + auto target_xnorm_dims = framework::make_ddim({x_dims[0], 1}); + auto target_ynorm_dims = framework::make_ddim({y_dims[0], 1}); + PADDLE_ENFORCE_EQ(xnorm_dims, target_xnorm_dims, + "Shape of Input(XNorm) must be [X.Dim(0), 1]."); + PADDLE_ENFORCE_EQ(ynorm_dims, target_ynorm_dims, + "Shape of Input(YNorm) must be [Y.Dim(0), 1]."); + PADDLE_ENFORCE_EQ(out_dims, target_xnorm_dims, + "Shape of Input(Out) must be [X.Dim(0), 1]."); + PADDLE_ENFORCE_EQ(out_grad_dims, target_xnorm_dims, + "Shape of Input(Out@Grad) must be [X.Dim(0), 1]."); + + // resize tensor + auto x_grad_name = framework::GradVarName("X"); + auto y_grad_name = framework::GradVarName("Y"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + if (ctx->HasOutput(y_grad_name)) { + ctx->SetOutputDim(y_grad_name, y_dims); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(cos_sim, ops::CosSimOp, ops::CosSimOpMaker, cos_sim_grad, + ops::CosSimOpGrad); +REGISTER_OP_CPU_KERNEL( + cos_sim, ops::CosSimKernel); +REGISTER_OP_CPU_KERNEL( + cos_sim_grad, + ops::CosSimGradKernel); diff --git a/paddle/fluid/operators/cos_sim_op.cu b/paddle/fluid/operators/cos_sim_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..c8cf363cdc4009bd8fa233a52435dcc6ea56cf3c --- /dev/null +++ b/paddle/fluid/operators/cos_sim_op.cu @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/cos_sim_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + cos_sim, ops::CosSimKernel); +REGISTER_OP_CUDA_KERNEL( + cos_sim_grad, + ops::CosSimGradKernel); diff --git a/paddle/fluid/operators/cos_sim_op.h b/paddle/fluid/operators/cos_sim_op.h new file mode 100644 index 0000000000000000000000000000000000000000..9cd8b196daf6e4afe9bde4d91db0110430cd7324 --- /dev/null +++ b/paddle/fluid/operators/cos_sim_op.h @@ -0,0 +1,131 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/cos_sim_functor.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class CosSimKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + // get Tensor + auto* in_x = context.Input("X"); + auto* in_y = context.Input("Y"); + auto* out_z = context.Output("Out"); + auto* out_x_norm = context.Output("XNorm"); + auto* out_y_norm = context.Output("YNorm"); + out_z->mutable_data(context.GetPlace()); + out_x_norm->mutable_data(context.GetPlace()); + out_y_norm->mutable_data(context.GetPlace()); + + int rows_x = in_x->dims()[0]; + int rows_y = in_y->dims()[0]; + + int cols = framework::product(in_x->dims()) / rows_x; + + if (rows_x == rows_y) { + math::CosSimFunctor functor( + in_x->data(), in_y->data(), out_x_norm->data(), + out_y_norm->data(), out_z->data(), cols); + platform::ForRange for_range( + static_cast(context.device_context()), rows_x); + for_range(functor); + } else { + math::CosSimFunctor functor( + in_x->data(), in_y->data(), out_x_norm->data(), + out_y_norm->data(), out_z->data(), cols); + platform::ForRange for_range( + static_cast(context.device_context()), rows_x); + for_range(functor); + } + } +}; + +template +class CosSimGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + // get Tensor + auto* in_x = context.Input("X"); + auto* in_y = context.Input("Y"); + auto* in_z = context.Input("Out"); + auto* in_x_norm = context.Input("XNorm"); + auto* in_y_norm = context.Input("YNorm"); + auto* out_grad_x = context.Output(framework::GradVarName("X")); + auto* out_grad_y = context.Output(framework::GradVarName("Y")); + auto* in_grad_z = context.Input(framework::GradVarName("Out")); + + // compute gradident + int rows_x = in_x->dims()[0]; + int rows_y = in_y->dims()[0]; + int cols = framework::product(in_x->dims()) / rows_x; + + if (rows_x == rows_y) { + if (out_grad_x) { + math::CosSimGradFunctor functor( + in_x_norm->data(), in_y_norm->data(), in_x->data(), + in_y->data(), in_z->data(), in_grad_z->data(), + 
out_grad_x->mutable_data(context.GetPlace()), cols); + platform::ForRange for_range( + static_cast(context.device_context()), + rows_x); + for_range(functor); + } + if (out_grad_y) { + math::CosSimGradFunctor functor( + in_y_norm->data(), in_x_norm->data(), in_y->data(), + in_x->data(), in_z->data(), in_grad_z->data(), + out_grad_y->mutable_data(context.GetPlace()), cols); + platform::ForRange for_range( + static_cast(context.device_context()), + rows_x); + for_range(functor); + } + } else { + if (out_grad_x) { + math::CosSimDxFunctor functor( + in_x_norm->data(), in_y_norm->data(), in_x->data(), + in_y->data(), in_z->data(), in_grad_z->data(), + out_grad_x->mutable_data(context.GetPlace()), cols); + platform::ForRange for_range( + static_cast(context.device_context()), + rows_x); + for_range(functor); + } + if (out_grad_y) { + out_grad_y->mutable_data(context.GetPlace()); + math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); + set_zero(dev_ctx, out_grad_y, static_cast(0)); + + math::CosSimDyFunctor functor; + functor(dev_ctx, in_x_norm->data(), in_y_norm->data(), + in_x->data(), in_y->data(), in_z->data(), + in_grad_z->data(), static_cast(rows_x), + static_cast(cols), out_grad_y->data()); + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/create_reader_op.cc b/paddle/fluid/operators/create_reader_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d1ba51f2c0f13a1b6e4d7ccb93c912703a0b1d86 --- /dev/null +++ b/paddle/fluid/operators/create_reader_op.cc @@ -0,0 +1,240 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
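+
+// This file defines the "create_*" reader operators: a random data generator
+// that acts as a file-reader-like source, plus shuffle and batch readers that
+// decorate an underlying reader. Data shapes are passed through the flattened
+// 'shape_concat' attribute together with 'ranks'; RestoreShapes() below splits
+// shape_concat back into one DDim per rank entry.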
+ +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/reader.h" + +namespace paddle { +namespace operators { + +static std::vector RestoreShapes( + const std::vector& shape_concat, const std::vector& ranks) { + std::vector res; + int offset = 0; + for (int len : ranks) { + auto start_it = shape_concat.begin() + offset; + auto end_it = start_it + len; + res.push_back(framework::make_ddim(std::vector(start_it, end_it))); + offset += len; + } + return res; +} + +// general infershape for file readers +class CreateFileReaderInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "The output file reader should not be null."); + const auto shape_concat = + ctx->Attrs().Get>("shape_concat"); + const auto ranks = ctx->Attrs().Get>("ranks"); + std::vector shapes = RestoreShapes(shape_concat, ranks); + ctx->SetReaderDims("Out", shapes); + + if (ctx->IsRuntime()) { + const auto lod_levels = ctx->Attrs().Get>("lod_levels"); + PADDLE_ENFORCE_EQ( + lod_levels.size(), shapes.size(), + "The number of 'lod_levels'(%d) doesn't match the number " + "of 'shapes'(%d).", + lod_levels.size(), shapes.size()); + framework::VarDesc* reader = + boost::get(ctx->GetOutputVarPtrs("Out")[0]); + reader->SetLoDLevels(lod_levels); + } + } +}; + +// general infershape for decorated readers +class CreateDecoratedReaderInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("UnderlyingReader"), + "Input(UnderlyingReader) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "The output decorated reader should not be null."); + ctx->SetReaderDims("Out", ctx->GetReaderDims("UnderlyingReader")); + + if (ctx->IsRuntime()) { + framework::VarDesc* in_reader = boost::get( + ctx->GetInputVarPtrs("UnderlyingReader")[0]); + framework::VarDesc* out_reader = + boost::get(ctx->GetOutputVarPtrs("Out")[0]); + out_reader->SetLoDLevels(in_reader->GetLoDLevels()); + } + } +}; + +// general var type inference for file readers +class CreateFileReaderInferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + std::string reader_name = op_desc.Output("Out")[0]; + framework::VarDesc* reader = block->FindVarRecursive(reader_name); + reader->SetType(framework::proto::VarDesc::READER); + } +}; + +// general var type inference for decorated readers +class CreateDecoratedReaderInferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + std::string in_reader_name = op_desc.Input("UnderlyingReader")[0]; + framework::VarDesc* in_reader = block->FindVarRecursive(in_reader_name); + std::string out_reader_name = op_desc.Output("Out")[0]; + framework::VarDesc* out_reader = block->FindVarRecursive(out_reader_name); + out_reader->SetType(framework::proto::VarDesc::READER); + out_reader->SetDataTypes(in_reader->GetDataTypes()); + } +}; + +template +class CreateRandomDataGeneratorOp : public framework::OperatorBase { + public: + using framework::OperatorBase::OperatorBase; + void Run(const framework::Scope& scope, + const platform::Place& dev_place) const override { + const auto& shape_concat = Attr>("shape_concat"); + const auto& ranks = Attr>("ranks"); + PADDLE_ENFORCE(!shape_concat.empty() && 
!ranks.empty()); + PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0), + int(shape_concat.size()), + "The accumulate of all ranks should be equal to the " + "shape concat's length."); + std::vector shapes = RestoreShapes(shape_concat, ranks); + auto* out = scope.FindVar(Output("Out")) + ->template GetMutable(); + out->Reset(new framework::RandomDataGenerator(shapes, Attr("min"), + Attr("max"))); + } +}; + +class CreateRandomDataGeneratorOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + CreateRandomDataGeneratorOpMaker(OpProto* op_proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(op_proto, op_checker) { + AddOutput("Out", "(ReaderHolder) The created random reader."); + AddAttr>("shape_concat", + "The concat of all data's shapes."); + AddAttr>( + "ranks", + "The ranks of each data." + "e.g." + "shape_concat = [2,3,4,5,6]" + "ranks = [3,2]" + "It means the reader will generate two data each time," + "whose shapes are [2,3,4] and [5,6] respectively."); + AddAttr>("lod_levels", "The LoD levels of each data."); + AddAttr("min", "The lower bound of reader's uniform distribution."); + AddAttr("max", "The upper bound of reader's uniform distribution."); + AddComment(R"DOC( + CreateRandomDataGenerator Operator + + This Op creates a random reader. + The reader generates random data instead of really reading from files. + Generated data follow an uniform distribution between 'min' and 'max'. + )DOC"); + } +}; + +class CreateShuffleReaderOp : public framework::OperatorBase { + public: + using framework::OperatorBase::OperatorBase; + void Run(const framework::Scope& scope, + const platform::Place& dev_place) const override { + const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) + ->Get(); + auto* out = scope.FindVar(Output("Out")) + ->template GetMutable(); + out->Reset(new framework::ShuffleReader(underlying_reader.Get(), + Attr("buffer_size"))); + } +}; + +class CreateShuffleReaderOpMaker : public framework::OpProtoAndCheckerMaker { + public: + CreateShuffleReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(op_proto, op_checker) { + AddInput( + "UnderlyingReader", + "(ReaderHolder) The underlying reader for creating a shuffle reader."); + AddOutput("Out", "(ReaderHolder) The created shuffle reader."); + AddAttr("buffer_size", "The shuffle buffer size.").GreaterThan(0); + AddComment(R"DOC( + CreateShuffleReader Operator + + A shuffle reader takes another reader as its 'underlying reader' + and yields the underlying reader's outputs in a shuffled order. 
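+
+      The 'buffer_size' attribute sets how many instances the shuffle buffer
+      holds at a time; a larger buffer typically gives a more thorough shuffle
+      at the cost of extra memory.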
+ )DOC"); + } +}; + +class CreateBatchReaderOp : public framework::OperatorBase { + public: + using framework::OperatorBase::OperatorBase; + void Run(const framework::Scope& scope, + const platform::Place& dev_place) const override { + const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) + ->Get(); + auto* out = scope.FindVar(Output("Out")) + ->template GetMutable(); + out->Reset(new framework::BatchReader(underlying_reader.Get(), + Attr("batch_size"))); + } +}; + +class CreateBatchReaderOpMaker : public framework::OpProtoAndCheckerMaker { + public: + CreateBatchReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(op_proto, op_checker) { + AddInput( + "UnderlyingReader", + "(ReaderHolder) The underlying reader for creating a batch reader."); + AddOutput("Out", "(ReaderHolder) The created batch reader."); + AddAttr("batch_size", + "How many instances the batch reader yields each time.") + .GreaterThan(0); + AddComment(R"DOC( + CreateBatchReader Operator + + A batch reader takes another reader as its 'underlying reader', + gathers the underlying reader's outputs and then yields them in batches. + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(create_random_data_generator, + ops::CreateRandomDataGeneratorOp, + ops::CreateFileReaderInferShape, + ops::CreateRandomDataGeneratorOpMaker, + paddle::framework::EmptyGradOpMaker, + ops::CreateFileReaderInferVarType); +REGISTER_OPERATOR(create_shuffle_reader, ops::CreateShuffleReaderOp, + ops::CreateDecoratedReaderInferShape, + ops::CreateShuffleReaderOpMaker, + paddle::framework::EmptyGradOpMaker, + ops::CreateDecoratedReaderInferVarType); +REGISTER_OPERATOR(create_batch_reader, ops::CreateBatchReaderOp, + ops::CreateDecoratedReaderInferShape, + ops::CreateBatchReaderOpMaker, + paddle::framework::EmptyGradOpMaker, + ops::CreateDecoratedReaderInferVarType); diff --git a/paddle/fluid/operators/crf_decoding_op.cc b/paddle/fluid/operators/crf_decoding_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e3c1fc95a3be574635ab8a99aa29b71bd8dbc71e --- /dev/null +++ b/paddle/fluid/operators/crf_decoding_op.cc @@ -0,0 +1,139 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/crf_decoding_op.h" + +namespace paddle { +namespace operators { +class CRFDecodingOpMaker : public framework::OpProtoAndCheckerMaker { + public: + CRFDecodingOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Emission", + "(LoDTensor, default: LoDTensor). A LoDTensor with shape " + "[N x D] where N is the size of the mini-batch and D is the total " + "tag number. This input is the unscaled emission weight matrix of " + "the linear_chain_crf operator."); + AddInput( + "Transition", + "(Tensor, default: Tensor). A Tensor with shape [(D + 2) x D]. 
" + "This input is the transition weights learned by the linear_chain_crf " + "operator, denoted as w. The 1st row of w are transition weights for " + "the start mask. The 2nd row of w are transition weights for the end " + "mask. Transition weights between other tags begin from the 3rd row of " + "w. See more details in comments of the linear_chain_crf operator."); + AddInput( + "Label", + "(LoDTensor, LoDTensor). The ground truth with shape " + "[N x 1]. This input is optional. See more details in the operator's " + "comments.") + .AsDispensable(); + AddOutput( + "ViterbiPath", + "(LoDTensor, LoDTensor). The decoding results. What to " + "return changes depending on whether the Input(Label) (the ground " + "truth) is given. See more details in the operator's comment."); + AddComment(R"DOC( +The crf_decoding operator reads the emission feature weights and the transition +feature weights learned by the linear_chain_crf operator. It implements the +Viterbi algorithm which is a dynamic programming algorithm for finding the most +likely sequence of hidden states, called the Viterbi path, that results in a +sequence of observed tags. + +The output of this operator changes according to whether Input(Label) is given: + +1. Input(Label) is given: + +This happens in training. This operator is used to co-work with the chunk_eval +operator. + +When Input(Label) is given, the crf_decoding operator returns a row vector +with shape [N x 1] whose values are fixed to be 0, indicating an incorrect +prediction, or 1 indicating a tag is correctly predicted. Such an output is the +input to chunk_eval operator. + +2. Input(Label) is not given: + +This is the standard decoding process. + +The crf_decoding operator returns a row vector with shape [N x 1] whose values +range from 0 to maximum tag number - 1. Each element indicates an index of a +predicted tag. 
+)DOC"); + } +}; + +class CRFDecodingOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Emission"), + "Input(Emission) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Transition"), + "Input(Transition) should be not null."); + + PADDLE_ENFORCE(ctx->HasOutput("ViterbiPath"), + "Output(ViterbiPath) should be not null."); + + auto emission_dims = ctx->GetInputDim("Emission"); + PADDLE_ENFORCE_EQ(emission_dims.size(), 2UL, + "The Input(Emission) should be a 2-D tensor."); + PADDLE_ENFORCE(emission_dims[0], "An empty mini-batch is not allowed."); + + auto transition_dims = ctx->GetInputDim("Transition"); + PADDLE_ENFORCE_EQ(transition_dims.size(), 2UL, + "The Input(Transition) should be a 2-D tensor."); + PADDLE_ENFORCE_EQ( + transition_dims[0] - 2, transition_dims[1], + "An invalid dimension for the Input(Transition), which should " + "be a 2-D tensor with shape [(D + 2) x D]."); + PADDLE_ENFORCE_EQ( + emission_dims[1], transition_dims[1], + "The 2nd dimension of the Input(Emission) and the Input(Transition) " + "should be equal to the tag number."); + + if (ctx->HasInput("Label")) { + auto label_dims = ctx->GetInputDim("Label"); + PADDLE_ENFORCE(label_dims.size() == 2UL && label_dims[1] == 1UL, + "The Input(Label) should be a 2-D tensor with the 2nd " + "dimensions fixed to 1."); + PADDLE_ENFORCE_EQ( + emission_dims[0], label_dims[0], + "The height of Input(Emission) and the height of Input(Label) " + "should be the same."); + } + + ctx->ShareLoD("Emission", /*->*/ "ViterbiPath"); + ctx->SetOutputDim("ViterbiPath", {emission_dims[0], 1}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Emission")->type()), + platform::CPUPlace()); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(crf_decoding, ops::CRFDecodingOp, + ops::CRFDecodingOpMaker); +REGISTER_OP_CPU_KERNEL( + crf_decoding, + ops::CRFDecodingOpKernel, + ops::CRFDecodingOpKernel); diff --git a/paddle/fluid/operators/crf_decoding_op.h b/paddle/fluid/operators/crf_decoding_op.h new file mode 100644 index 0000000000000000000000000000000000000000..c3c161eec5f541b9f60a36064d7e8c350078c664 --- /dev/null +++ b/paddle/fluid/operators/crf_decoding_op.h @@ -0,0 +1,124 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using framework::LoDTensor; +using framework::LoD; +using framework::Tensor; + +template +class CRFDecodingOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* emission_weights = ctx.Input("Emission"); + auto* transition_weights = ctx.Input("Transition"); + auto* label = ctx.Input("Label"); + auto* decoded_path = ctx.Output("ViterbiPath"); + + PADDLE_ENFORCE_EQ(emission_weights->NumLevels(), 1UL, + "The Input(Emission) should be a sequence."); + auto lod = emission_weights->lod(); + PADDLE_ENFORCE(lod.size(), "Input(Emission) must be a sequence."); + const size_t level = 0; + const size_t seq_num = lod[level].size() - 1; + + int64_t* path = decoded_path->mutable_data(platform::CPUPlace()); + math::SetConstant()( + ctx.template device_context(), decoded_path, 0); + for (size_t i = 0; i < seq_num; ++i) { + int start_pos = static_cast(lod[level][i]); + int end_pos = static_cast(lod[level][i + 1]); + Tensor decoded_path_one_seq = decoded_path->Slice(start_pos, end_pos); + Decode(emission_weights->Slice(start_pos, end_pos), *transition_weights, + &decoded_path_one_seq); + } + + if (label) { + PADDLE_ENFORCE_EQ(label->NumLevels(), 1UL, + "The Input(Label) should be a sequence."); + const int64_t* label_value = label->data(); + size_t batch_size = emission_weights->dims()[0]; + for (size_t i = 0; i < batch_size; ++i) { + path[i] = label_value[i] == path[i] ? 1 : 0; + } + } + } + + private: + void Decode(const Tensor& emission_weights, const Tensor& transition_weights, + Tensor* decoded_path) const { + auto emission_dims = emission_weights.dims(); + const size_t seq_len = emission_dims[0]; + const size_t tag_num = emission_dims[1]; + + const size_t state_trans_base_idx = 2; + + const T* x = emission_weights.data(); + const T* w = transition_weights.data(); + int64_t* path = decoded_path->data(); + + // alpha is a memo table. An element alpha(k, v) records the score of the + // best sequence of tags from position 1 to position k with v being the end + // tag. 
+ Tensor alpha; + T* alpha_value = alpha.mutable_data(emission_dims, platform::CPUPlace()); + Tensor track; + int* track_value = + track.mutable_data(emission_dims, platform::CPUPlace()); + + for (size_t i = 0; i < tag_num; ++i) alpha_value[i] = w[i] + x[i]; + + for (size_t k = 1; k < seq_len; ++k) { + for (size_t i = 0; i < tag_num; ++i) { + T max_score = -std::numeric_limits::max(); + int max_j = 0; + for (size_t j = 0; j < tag_num; ++j) { + T score = alpha_value[(k - 1) * tag_num + j] + + w[(j + state_trans_base_idx) * tag_num + i]; + if (score > max_score) { + max_score = score; + max_j = j; + } + } + + alpha_value[k * tag_num + i] = max_score + x[k * tag_num + i]; + track_value[k * tag_num + i] = max_j; + } + } + + T max_score = -std::numeric_limits::max(); + int max_i = 0; + for (size_t i = 0; i < tag_num; ++i) { + T score = alpha_value[(seq_len - 1) * tag_num + i] + w[tag_num + i]; + if (score > max_score) { + max_score = score; + max_i = i; + } + } + path[seq_len - 1] = max_i; + for (int k = seq_len - 1; k >= 1; --k) { + path[k - 1] = max_i = track_value[k * tag_num + max_i]; + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..8e80f77e497ee21ccd5322b544376f20cb7de012 --- /dev/null +++ b/paddle/fluid/operators/crop_op.cc @@ -0,0 +1,159 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/crop_op.h" +#include + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class CropOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of CropOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of CropOp should not be null."); + auto x_dim = ctx->GetInputDim("X"); + if (!ctx->HasInput("Y")) { + auto shape = ctx->Attrs().Get>("shape"); + PADDLE_ENFORCE_EQ( + int64_t(shape.size()), x_dim.size(), + "Shape size should be equal to dimention size of input tensor."); + std::vector tensor_shape(shape.size()); + for (size_t i = 0; i < shape.size(); ++i) { + tensor_shape[i] = static_cast(shape[i]); + } + ctx->SetOutputDim("Out", framework::make_ddim(tensor_shape)); + } else { + auto y_dim = ctx->GetInputDim("Y"); + PADDLE_ENFORCE_EQ(framework::arity(x_dim), framework::arity(y_dim), + "Tensor rank of both CropOp's " + "inputs must be same."); + ctx->SetOutputDim("Out", y_dim); + } + } +}; + +class CropOpMaker : public framework::OpProtoAndCheckerMaker { + public: + CropOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "The input of pad op. 
" + "The input should be a k-D tensor(k > 0 and k < 7)."); + AddInput("Y", + "The input used as reference for cropping, " + "which is of the same dimensions as X.") + .AsDispensable(); + AddOutput("Out", + "The output of crop op, " + "which is of the same dimensions as X."); + AddAttr>("offsets", + "A list describing offsets to be cropped. " + "The size of offsets list should be the same as " + "the dimension size of input X."); + AddAttr>("shape", + "A list describing the shape of output. " + "The size of shape list should be the same as " + "the dimension size of input X.") + .SetDefault(std::vector()); + AddComment(R"DOC( +Crop Operator. + +Crop input into output, as specified by offsets and shape. + +There are two ways to set shape: +1. reference input: crop input X into the same shape as reference input. + The dimension of reference input should + be the same as the dimension of input X. +2. shape list: crop input X into the shape described by a list. + The size of shape list should be the same as + the dimension size of input X. + +The input should be a k-D tensor(k > 0 and k < 7). As an example: + +Case 1: +Given + + X = [[0, 1, 2, 0, 0] + [0, 3, 4, 0, 0] + [0, 0, 0, 0, 0]], + +and + + offsets = [0, 1], + +and + + shape = [2, 2], + +we get: + + Out = [[1, 2], + [3, 4]]. + + +Case 2: +Given + + X = [[0, 1, 2, 5, 0] + [0, 3, 4, 6, 0] + [0, 0, 0, 0, 0]], + +and + + offsets = [0, 1], + +and + + Y = [[0, 0, 0] + [0, 0, 0]], + +we get: + + Out = [[1, 2, 5], + [3, 4, 6]]. +)DOC"); + } +}; + +class CropOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + auto x_dims = ctx->GetInputDim("X"); + auto x_grad_name = framework::GradVarName("X"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(crop, ops::CropOp, ops::CropOpMaker, crop_grad, ops::CropOpGrad); +REGISTER_OP_CPU_KERNEL(crop, ops::CropKernel); +REGISTER_OP_CPU_KERNEL( + crop_grad, ops::CropGradKernel); diff --git a/paddle/fluid/operators/crop_op.cu b/paddle/fluid/operators/crop_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..f3610675aae1572380ca9b778ac3251c1951678b --- /dev/null +++ b/paddle/fluid/operators/crop_op.cu @@ -0,0 +1,21 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/crop_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(crop, ops::CropKernel); +REGISTER_OP_CUDA_KERNEL( + crop_grad, ops::CropGradKernel); diff --git a/paddle/fluid/operators/crop_op.h b/paddle/fluid/operators/crop_op.h new file mode 100644 index 0000000000000000000000000000000000000000..9c7c0446d4c0baf1ba59eb860a928341eed7cce0 --- /dev/null +++ b/paddle/fluid/operators/crop_op.h @@ -0,0 +1,105 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/strided_memcpy.h" + +namespace paddle { +namespace operators { // Internal + +template +using EigenTensor = framework::EigenTensor; +using framework::Tensor; + +template +class CropKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + const T* x_data = x->data(); + T* out_data = out->mutable_data(context.GetPlace()); + auto x_stride = framework::stride(x->dims()); + auto out_stride = framework::stride(out->dims()); + auto offsets = context.Attr>("offsets"); + PADDLE_ENFORCE_EQ( + x->dims().size(), static_cast(offsets.size()), + "Offsets size should be equal to dimension size of input tensor."); + int64_t offset = 0; + for (size_t i = 0; i < offsets.size(); ++i) { + offset += (x_stride[i] * offsets[i]); + } + StridedMemcpy(context.device_context(), x_data + offset, x_stride, + out->dims(), out_stride, out_data); + } +}; + +template +void CropGradFunction(const framework::ExecutionContext& context) { + auto* d_x = context.Output(framework::GradVarName("X")); + if (d_x != nullptr) { + auto* d_out = context.Input(framework::GradVarName("Out")); + d_x->mutable_data(context.GetPlace()); + auto offsets = context.Attr>("offsets"); + Eigen::array, D> paddings; + for (size_t i = 0; i < D; ++i) { + paddings[i].first = offsets[i]; + paddings[i].second = d_x->dims()[i] - d_out->dims()[i] - offsets[i]; + } + auto d_x_tensor = EigenTensor::From(*d_x); + auto d_out_tensor = EigenTensor::From(*d_out); + d_x_tensor.device( + *context.template device_context().eigen_device()) = + d_out_tensor.pad(paddings, 0); + } +} + +template +class CropGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + size_t rank = + context.Input(framework::GradVarName("Out"))->dims().size(); + switch (rank) { + case 1: + CropGradFunction(context); + break; + case 2: + CropGradFunction(context); + break; + case 3: + CropGradFunction(context); + break; + case 4: + CropGradFunction(context); + break; + case 5: + CropGradFunction(context); + break; + case 6: + CropGradFunction(context); + break; + default: + PADDLE_THROW( + "CropOp only support tensors with no more than 6 dimensions."); + } + } +}; + +} // namespace 
operators +} // namespace paddle diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..5e34b248b6aa696eaa03c7e1b4236a76a9081ef0 --- /dev/null +++ b/paddle/fluid/operators/cross_entropy_op.cc @@ -0,0 +1,173 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/cross_entropy_op.h" + +namespace paddle { +namespace operators { + +class CrossEntropyOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) should be not null."); + + auto x_dims = ctx->GetInputDim("X"); + auto label_dims = ctx->GetInputDim("Label"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "Input(X)'s rank should be 2."); + PADDLE_ENFORCE_EQ(label_dims.size(), 2UL, + "Input(Label)'s rank should be 2."); + PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0], + "The 1st dimension of Input(X) and Input(Label) should " + "be equal."); + if (ctx->Attrs().Get("soft_label")) { + PADDLE_ENFORCE_EQ(x_dims[1], label_dims[1], + "If Attr(soft_label) == true, the 2nd dimension of " + "Input(X) and Input(Label) should be equal."); + } else { + PADDLE_ENFORCE_EQ(label_dims[1], 1UL, + "If Attr(softLabel) == false, the 2nd dimension of " + "Input(Label) should be 1."); + } + + ctx->SetOutputDim("Y", {x_dims[0], 1}); + ctx->ShareLoD("X", /*->*/ "Y"); + } + + protected: + // Explicitly set that the data type of computation kernel of cross_entropy + // is determined by its input "X". 
+ framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +class CrossEntropyGradientOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), + "Input(Y@GRAD) shoudl be not null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Output(X@GRAD) should be not null."); + + auto x_dims = ctx->GetInputDim("X"); + auto label_dims = ctx->GetInputDim("Label"); + auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y")); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2."); + PADDLE_ENFORCE_EQ(dy_dims.size(), 2, "Input(Y@Grad)'s rank should be 2."); + PADDLE_ENFORCE_EQ(label_dims.size(), 2, "Input(Label)'s rank should be 2."); + PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0], + "The 1st dimension of Input(X) and Input(Label) should " + "be equal."); + PADDLE_ENFORCE_EQ(x_dims[0], dy_dims[0], + "The 1st dimension of Input(X) and Input(Y@Grad) should " + "be equal."); + PADDLE_ENFORCE_EQ(dy_dims[1], 1, + "The 2nd dimension of Input(Y@Grad) should be 1."); + if (ctx->Attrs().Get("soft_label")) { + PADDLE_ENFORCE_EQ(x_dims[1], label_dims[1], + "When Attr(soft_label) == true, the 2nd dimension of " + "Input(X) and Input(Label) should be equal."); + } else { + PADDLE_ENFORCE_EQ(label_dims[1], 1, + "When Attr(soft_label) == false, the 2nd dimension of " + "Input(Label) should be 1."); + } + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + ctx->ShareLoD("X", framework::GradVarName("X")); + } + + protected: + // Explicitly set that the data type of computation kernel of cross_entropy + // is determined by its input "X". + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker { + public: + CrossEntropyOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(Tensor, default Tensor), a 2-D tensor with shape [N x D]," + " where N is the batch size and D is the number of classes. " + "This input is a probability computed by the previous operator, " + "which is almost always the result of a softmax operator."); + AddInput("Label", + "(Tensor), the ground truth which is a 2-D tensor. When " + "soft_label is set to false, Label is a Tensor with shape " + "[N x 1]. When soft_label is set to true, Label is a " + "Tensor with shape [N x D]."); + AddOutput("Y", + "(Tensor, default Tensor), a 2-D tensor with shape " + "[N x 1]. The cross entropy loss."); + AddAttr("soft_label", + "(bool, default false), a flag indicating whether to " + "interpretate the given labels as soft labels.") + .SetDefault(false); + AddComment(R"DOC( +CrossEntropy Operator. + +It supports both standard cross-entropy and soft-label cross-entropy loss +computation. 
+1) One-hot cross-entropy: + soft_label = false, Label[i, 0] indicates the class index for sample i: + + $Y[i] = -\log(X[i, Label[i]])$ + +2) Soft-label cross-entropy: + soft_label = true, Label[i, j] indicates the soft label of class j + for sample i: + + $Y[i] = \sum_j{-Label[i, j] * log(X[i, j])}$ + + Please make sure that in this case the summuation of each row of Label + equals one. + +3) One-hot cross-entropy with vecterized Input(Label): + As a special case of 2), when each row of Input(Label) has only one + non-zero element (equals 1), soft-label cross-entropy degenerates to a + one-hot cross-entropy with one-hot label representation. + +Both the input X and Label can carry the LoD (Level of Details) information, +or not. But the output only shares the LoD information with input X. + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(cross_entropy, ops::CrossEntropyOp, ops::CrossEntropyOpMaker, + cross_entropy_grad, ops::CrossEntropyGradientOp); +REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel, + ops::CrossEntropyOpKernel); +REGISTER_OP_CPU_KERNEL(cross_entropy_grad, + ops::CrossEntropyGradientOpKernel, + ops::CrossEntropyGradientOpKernel); diff --git a/paddle/fluid/operators/cross_entropy_op.cu b/paddle/fluid/operators/cross_entropy_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..de0976c69fc65ebc2ef7df9025d6071545e91c33 --- /dev/null +++ b/paddle/fluid/operators/cross_entropy_op.cu @@ -0,0 +1,111 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
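To make the two loss formulas above concrete, here is a tiny numeric sketch for one sample with three classes; the probabilities and soft label are made up.

#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  // One sample, D = 3 classes; X is already a probability distribution.
  std::vector<double> x = {0.2, 0.7, 0.1};

  // 1) One-hot (hard) label with class index 1:  Y = -log(X[1]).
  const int hard_label = 1;
  const double y_hard = -std::log(x[hard_label]);

  // 2) Soft label: Y = sum_j -Label[j] * log(X[j]); the soft label sums to one.
  std::vector<double> soft_label = {0.1, 0.8, 0.1};
  double y_soft = 0.0;
  for (size_t j = 0; j < x.size(); ++j) y_soft += -soft_label[j] * std::log(x[j]);

  std::printf("hard: %.4f  soft: %.4f\n", y_hard, y_soft);  // hard ~= 0.3567
  return 0;
}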
*/ + +#include "paddle/fluid/operators/cross_entropy_op.h" + +namespace paddle { +namespace operators { + +namespace { + +template +__global__ void CrossEntropyGradientKernel(T* dX, const T* dY, const T* X, + const int64_t* label, const int N, + const int D) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; + i += blockDim.x * gridDim.x) { + int idx = i * D + label[i]; + dX[idx] = -dY[i] / X[idx]; + } +} + +template +__global__ void SoftCrossEntropyGradientKernel(T* dX, const T* dY, const T* X, + const T* label, const int N, + const int D) { + int ids = blockIdx.x * blockDim.x + threadIdx.x; + if (ids < N * D) { + int row_ids = ids / D; + dX[ids] = -label[ids] * dY[row_ids] / X[ids]; + } +} +} // namespace + +template +class CrossEntropyOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "This kernel only runs on GPU device."); + const Tensor* x = ctx.Input("X"); + const Tensor* label = ctx.Input("Label"); + Tensor* y = ctx.Output("Y"); + y->mutable_data(ctx.GetPlace()); + + math::CrossEntropyFunctor()( + ctx.template device_context(), y, x, label, + ctx.Attr("soft_label")); + } +}; + +template +class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "This kernel only runs on GPU device."); + + const Tensor* x = ctx.Input("X"); + const Tensor* label = ctx.Input("Label"); + Tensor* dx = ctx.Output(framework::GradVarName("X")); + dx->mutable_data(ctx.GetPlace()); + + const T* dy_data = + ctx.Input(framework::GradVarName("Y"))->data(); + T* dx_data = dx->mutable_data(ctx.GetPlace()); + const T* x_data = x->data(); + + int64_t batch_size = x->dims()[0]; + int64_t class_num = x->dims()[1]; + + int block = 512; + int grid = (batch_size * class_num + block - 1) / block; + + auto& dev_ctx = ctx.template device_context(); + auto stream = dev_ctx.stream(); + + if (ctx.Attr("soft_label")) { + auto* label_data = label->data(); + SoftCrossEntropyGradientKernel<<>>( + dx_data, dy_data, x_data, label_data, batch_size, class_num); + } else { + math::SetConstant functor; + functor(dev_ctx, dx, 0); + auto* label_data = label->data(); + grid = (batch_size + block - 1) / block; + CrossEntropyGradientKernel<<>>( + dx_data, dy_data, x_data, label_data, batch_size, class_num); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(cross_entropy, ops::CrossEntropyOpCUDAKernel, + ops::CrossEntropyOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(cross_entropy_grad, + ops::CrossEntropyGradientOpCUDAKernel, + ops::CrossEntropyGradientOpCUDAKernel); diff --git a/paddle/fluid/operators/cross_entropy_op.h b/paddle/fluid/operators/cross_entropy_op.h new file mode 100644 index 0000000000000000000000000000000000000000..4a5b20ecb70887dd03865e53f168dad818195b16 --- /dev/null +++ b/paddle/fluid/operators/cross_entropy_op.h @@ -0,0 +1,88 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/cross_entropy.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenMatrix = framework::EigenMatrix; + +template +class CrossEntropyOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), + "This kernel only runs on CPU."); + const Tensor* x = ctx.Input("X"); + const Tensor* labels = ctx.Input("Label"); + Tensor* y = ctx.Output("Y"); + y->mutable_data(ctx.GetPlace()); + + math::CrossEntropyFunctor()( + ctx.template device_context(), y, x, labels, + ctx.Attr("soft_label")); + } +}; + +template +class CrossEntropyGradientOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), + "This kernel only runs on CPU."); + const Tensor* x = ctx.Input("X"); + const Tensor* dy = ctx.Input(framework::GradVarName("Y")); + const Tensor* label = ctx.Input("Label"); + Tensor* dx = ctx.Output(framework::GradVarName("X")); + T* dx_data = dx->mutable_data(ctx.GetPlace()); + + int64_t class_num = x->dims()[1]; + if (ctx.Attr("soft_label")) { + auto x_mat = EigenMatrix::From(*x); + auto dy_mat = EigenMatrix::From(*dy); + auto lbl_mat = EigenMatrix::From(*label); + auto dx_mat = EigenMatrix::From(*dx); + + dx_mat.device(*ctx.template device_context() + .eigen_device()) = + -(lbl_mat * + dy_mat.broadcast(Eigen::DSizes(1, class_num)) / x_mat); + } else { + int64_t batch_size = x->dims()[0]; + const T* dy_data = dy->data(); + const T* x_data = x->data(); + const int64_t* label_data = label->data(); + + math::SetConstant functor; + functor(ctx.template device_context(), dx, 0); + + for (int64_t i = 0; i < batch_size; ++i) { + PADDLE_ASSERT(label_data[i] >= 0 || label_data[i] < class_num); + int64_t index = i * class_num + label_data[i]; + dx_data[index] = -dy_data[i] / x_data[index]; + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/ctc_align_op.cc b/paddle/fluid/operators/ctc_align_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3c7db78813e3bdd90d1c65e3af26208ae2a9ba21 --- /dev/null +++ b/paddle/fluid/operators/ctc_align_op.cc @@ -0,0 +1,93 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/ctc_align_op.h" + +namespace paddle { +namespace operators { + +class CTCAlignOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input of CTCAlignOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Output"), + "Output of CTCAlignOp should not be null."); + + auto input_dims = ctx->GetInputDim("Input"); + + // TODO(wanghaoshuang): it is tricky to set the wrong dimension here. + ctx->SetOutputDim("Output", input_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), + ctx.device_context()); + } +}; + +class CTCAlignOpMaker : public framework::OpProtoAndCheckerMaker { + public: + CTCAlignOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Input", + "(LodTensor, default: LoDTensor), Its shape is " + "[Lp, 1], where Lp is the sum of all input sequences' length."); + AddOutput("Output", "(Tensor, default: Tensor), The align result."); + AddAttr("blank", + "(int, default: 0), the blank label setted in Connectionist " + "Temporal Classification (CTC) op.") + .SetDefault(0); + AddAttr("merge_repeated", + "(bool, default: true), whether to " + "merge repeated elements between two blanks. ") + .SetDefault(true); + AddComment(R"DOC( +CTCAlign op is used to merge repeated elements between two blanks +and then delete all blanks in sequence. + +Given: + Input.data = [0, 1, 2, 2, 0, 4, 0, 4, 5, 0, 6, + 6, 0, 0, 7, 7, 7, 0] + Input.dims = {18, 1} + Input.LoD = [[0, 11, 18]] + +And: + blank = 0 + merge_repeated = True + +Then: + Output.data = [1, 2, 4, 4, 5, 6, + 6, 7] + Output.dims = {8, 1} + Output.LoD = [[0, 6, 8]] + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(ctc_align, ops::CTCAlignOp, ops::CTCAlignOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL( + ctc_align, ops::CTCAlignKernel, + ops::CTCAlignKernel); diff --git a/paddle/fluid/operators/ctc_align_op.cu b/paddle/fluid/operators/ctc_align_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..f629e0a9f15192c4d0d6fa5b8a122811d11ca415 --- /dev/null +++ b/paddle/fluid/operators/ctc_align_op.cu @@ -0,0 +1,99 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include "paddle/fluid/operators/ctc_align_op.h" + +namespace paddle { +namespace operators { + +template +__global__ void MergeAndDelCudaKernel(const int64_t num_token, const T* tokens, + const size_t num_seq, size_t* lod0, + const int blank, const int merge_repeated, + size_t* out_lod0, T* output) { + int ouput_idx = 0; + out_lod0[0] = 0; + + for (int i = 0; i < num_seq; ++i) { + T pre_token = -1; + for (int j = lod0[i]; j < lod0[i + 1]; ++j) { + if (tokens[j] != blank && !(merge_repeated && tokens[j] == pre_token)) { + output[ouput_idx] = tokens[j]; + ++ouput_idx; + } + pre_token = tokens[j]; + } + out_lod0[i + 1] = ouput_idx; + } +} + +template +class CTCAlignOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + const size_t level = 0; + auto* input = ctx.Input("Input"); + auto* output = ctx.Output("Output"); + auto input_lod = framework::ToAbsOffset(input->lod()); + + const T* tokens = input->data(); + const int64_t num_tokens = input->dims()[0]; + const size_t num_seq = input_lod[level].size() - 1; + + const int blank = ctx.Attr("blank"); + const int merge_repeated = + static_cast(ctx.Attr("merge_repeated")); + + // prepare a lod to record lod information while merging elements + thrust::device_vector dev_out_lod0(input_lod[level].size()); + size_t* dev_out_lod0_ptr = thrust::raw_pointer_cast(dev_out_lod0.data()); + + // merge elements and delete blank + T* output_data = output->mutable_data({num_tokens, 1}, ctx.GetPlace()); + + auto stream = ctx.cuda_device_context().stream(); + MergeAndDelCudaKernel<<<1, 1, 0, stream>>>( + num_tokens, tokens, num_seq, + input_lod[level].CUDAMutableData(ctx.GetPlace()), blank, merge_repeated, + dev_out_lod0_ptr, output_data); + + // set output lod + std::vector host_out_lod0(dev_out_lod0.begin(), dev_out_lod0.end()); + framework::LoD out_lod; + out_lod.push_back(host_out_lod0); + output->set_lod(out_lod); + + // resize output dims + output->Resize({static_cast(host_out_lod0.back()), 1}); + + if (host_out_lod0.back() == 0) { + output->Resize({1, 1}); + output->mutable_data(ctx.GetPlace()); + math::SetConstant set_constant; + set_constant(ctx.template device_context(), + output, -1); + } + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL(ctc_align, paddle::operators::CTCAlignOpCUDAKernel, + paddle::operators::CTCAlignOpCUDAKernel); diff --git a/paddle/fluid/operators/ctc_align_op.h b/paddle/fluid/operators/ctc_align_op.h new file mode 100644 index 0000000000000000000000000000000000000000..1ef034c2f5b566e3cf720e295953fd7a69dd5812 --- /dev/null +++ b/paddle/fluid/operators/ctc_align_op.h @@ -0,0 +1,82 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +class CTCAlignKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* output = ctx.Output("Output"); + const size_t level = 0; + auto input_lod = framework::ToAbsOffset(input->lod()); + + // check input dims and lod + auto input_dims = input->dims(); + PADDLE_ENFORCE_EQ(input_dims[0], + static_cast(input_lod[level].back()), + "The first dimension of Input(Input) should be equal to " + "the sum of all sequences' lengths."); + + const size_t num_sequences = input_lod[level].size() - 1; + size_t blank = static_cast(ctx.Attr("blank")); + bool merge_repeated = ctx.Attr("merge_repeated"); + + // merge repeated tokens and delete blank + T* output_data = output->mutable_data(ctx.GetPlace()); + size_t output_idx = 0; + std::vector output_lod0(1, 0); + const T* input_data = input->data(); + for (size_t seq_idx = 0; seq_idx < num_sequences; ++seq_idx) { + T prev_token = -1; + for (size_t i = input_lod[level][seq_idx]; + i < input_lod[level][seq_idx + 1]; ++i) { + if ((unsigned)input_data[i] != blank && + !(merge_repeated && input_data[i] == prev_token)) { + output_data[output_idx] = input_data[i]; + ++output_idx; + } + prev_token = input_data[i]; + } + output_lod0.push_back(output_idx); + } + + // set output lod + framework::LoD output_lod; + output_lod.push_back(output_lod0); + output->set_lod(output_lod); + // resize output dims + output->Resize({static_cast(output_lod0.back()), 1}); + // for empty sequence + if (output_lod0.back() == 0) { + output->Resize({1, 1}); + output_data = output->mutable_data(ctx.GetPlace()); + output_data[0] = -1; + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/cum_op.h b/paddle/fluid/operators/cum_op.h new file mode 100644 index 0000000000000000000000000000000000000000..3b2249147848b790833d09a0abe0370057ddd617 --- /dev/null +++ b/paddle/fluid/operators/cum_op.h @@ -0,0 +1,111 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/detail/safe_ref.h" + +namespace paddle { +namespace operators { + +template +class CumKernel : public framework::OpKernel { + public: + using T = typename Functor::ELEMENT_TYPE; + + void Compute(const framework::ExecutionContext& context) const override { + auto& X = detail::Ref(context.Input("X"), + "Cannot get input tensor X, variable name = %s", + context.op().Input("X")); + + auto& Out = detail::Ref(context.Output("Out"), + "Cannot get output tensor Out, variable name = %s", + context.op().Output("Out")); + int axis = context.Attr("axis"); + bool exclusive = context.Attr("exclusive"); + bool reverse = context.Attr("reverse"); + auto x_dims = X.dims(); + if (axis == -1) { + axis = x_dims.size() - 1; + } + PADDLE_ENFORCE_LT( + axis, x_dims.size(), + "axis should be less than the dimensiotn of the input tensor"); + Out.mutable_data(context.GetPlace()); + + int pre = 1; + int post = 1; + int mid = x_dims[axis]; + for (int i = 0; i < axis; ++i) { + pre *= x_dims[i]; + } + for (int i = axis + 1; i < x_dims.size(); ++i) { + post *= x_dims[i]; + } + + auto x = framework::EigenVector::Flatten(X); + auto out = framework::EigenVector::Flatten(Out); + auto* place = + context.template device_context().eigen_device(); + + using IndexT = Eigen::DenseIndex; + if (pre == 1) { + if (post == 1) { + ComputeImp(*place, Eigen::DSizes(mid), x, out, + /* axis= */ 0, reverse, exclusive); + } else { + ComputeImp(*place, Eigen::DSizes(mid, post), x, out, + /* axis= */ 0, reverse, exclusive); + } + } else { + if (post == 1) { + ComputeImp(*place, Eigen::DSizes(pre, mid), x, out, + /* axis= */ 1, reverse, exclusive); + } else { + ComputeImp(*place, Eigen::DSizes(pre, mid, post), x, out, + /* axis= */ 1, reverse, exclusive); + } + } + } + + private: + template + void ComputeImp(Device d, const Dim& dims, X x, Out out, int axis, + bool reverse, bool exclusive) const { + if (!reverse) { + out.reshape(dims).device(d) = Functor()(x.reshape(dims), axis, exclusive); + } else { + std::array rev; + rev.fill(false); + rev[axis] = reverse; + out.reshape(dims).device(d) = + Functor()(x.reshape(dims).reverse(rev), axis, exclusive).reverse(rev); + } + } +}; + +template +struct CumsumFunctor { + using ELEMENT_TYPE = T; + template + const typename X::TensorScanSumOp operator()(X x, int axis, + bool exclusive) const { + return x.cumsum(axis, exclusive); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/cumsum_op.cc b/paddle/fluid/operators/cumsum_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d15d4e3db35c4cd27f7b990a39a40af57acd5a65 --- /dev/null +++ b/paddle/fluid/operators/cumsum_op.cc @@ -0,0 +1,82 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/cum_op.h" + +namespace paddle { +namespace operators { + +class CumOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class CumsumOpMaker : public framework::OpProtoAndCheckerMaker { + public: + CumsumOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Cumsum operator"); + AddOutput("Out", "Output of Cumsum operator"); + AddAttr("axis", + "(int, default -1). The dimenstion to accumulate along. " + "-1 means the last dimenstion") + .SetDefault(-1) + .EqualGreaterThan(-1); + AddAttr("exclusive", + "bool, default false). Whether to perform exclusive cumsum") + .SetDefault(false); + AddAttr("reverse", + "bool, default false). If true, the cumsum is performed in " + "the reversed direction") + .SetDefault(false); + AddComment(R"DOC( +The cumulative sum of the elements along a given axis. +By default, the first element of the result is the same of the first element of +the input. If exlusive is true, the first element of the result is 0. +)DOC"); + } +}; + +class CumsumGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("cumsum"); + grad_op->SetInput("X", OutputGrad("Out")); + grad_op->SetOutput("Out", InputGrad("X")); + grad_op->SetAttr("axis", Attr("axis")); + grad_op->SetAttr("reverse", !Attr("reverse")); + grad_op->SetAttr("exclusive", Attr("exclusive")); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +using CPU = paddle::platform::CPUDeviceContext; + +REGISTER_OPERATOR(cumsum, ops::CumOp, ops::CumsumOpMaker, ops::CumsumGradMaker); +REGISTER_OP_CPU_KERNEL(cumsum, ops::CumKernel>, + ops::CumKernel>, + ops::CumKernel>) diff --git a/paddle/fluid/operators/cumsum_op.cu b/paddle/fluid/operators/cumsum_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..e063cc0f65a5d63f8f558c5f16e548a1e1fcd4f6 --- /dev/null +++ b/paddle/fluid/operators/cumsum_op.cu @@ -0,0 +1,22 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/cum_op.h" + +namespace ops = paddle::operators; +using CUDA = paddle::platform::CUDADeviceContext; + +REGISTER_OP_CUDA_KERNEL(cumsum, ops::CumKernel>, + ops::CumKernel>, + ops::CumKernel>) diff --git a/paddle/fluid/operators/decayed_adagrad_op.cc b/paddle/fluid/operators/decayed_adagrad_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d827155919ed060df6bb45bbb54c286e81cb6c81 --- /dev/null +++ b/paddle/fluid/operators/decayed_adagrad_op.cc @@ -0,0 +1,101 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/decayed_adagrad_op.h" + +namespace paddle { +namespace operators { + +class DecayedAdagradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Param"), + "Input(Param) of DecayedAdagradOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(Grad) of DecayedAdagradOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Moment"), + "Input(Moment) of DecayedAdagradOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("LearningRate"), + "Input(LearningRate) of DecayedAdagradOp should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + "Output(ParamOut) of DecayedAdagradOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("MomentOut"), + "Output(MomentOut) of DecayedAdagradOp should not be null."); + + auto lr_dims = ctx->GetInputDim("LearningRate"); + PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, + "LearningRate should have one element"); + auto param_dims = ctx->GetInputDim("Param"); + PADDLE_ENFORCE_EQ(param_dims, ctx->GetInputDim("Grad"), + "Param and Grad input of DecayedAdagradOp should have " + "the same dimension."); + PADDLE_ENFORCE_EQ(param_dims, ctx->GetInputDim("Moment"), + "Param and Moment input of DecayedAdagradOp should have " + "the same dimension."); + + ctx->SetOutputDim("ParamOut", param_dims); + ctx->SetOutputDim("MomentOut", param_dims); + } +}; + +class DecayedAdagradOpMaker : public framework::OpProtoAndCheckerMaker { + public: + DecayedAdagradOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Param", "(Tensor) Input parameter"); + AddInput("Grad", "(Tensor) Input gradient"); + AddInput("Moment", "(Tensor) Second moment"); + AddInput("LearningRate", "(Tensor) Learning rate"); + + AddOutput("ParamOut", "(Tensor) Output parameter"); + AddOutput("MomentOut", "(Tensor) Output second moment"); + + AddAttr("decay", + "(float, default 0.95) " + "Discounting factor for coming gradient") + .SetDefault(0.95); + AddAttr("epsilon", + "(float, default 1.0e-6) " + "Constant for numerical stability") + .SetDefault(1.0e-6f); + AddComment(R"DOC( +Decayed Adagrad Optimizer. 
+ +The update is done as follows: + +$$ +moment\_out = decay * moment + (1 - decay) * grad * grad \\ +param\_out = param - \frac{learning\_rate * grad}{\sqrt{moment\_out} + epsilon} +$$ + +The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) +does not have an epsilon attribute. It is added here for numerical +stability to avoid the division by zero error. + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(decayed_adagrad, ops::DecayedAdagradOp, + ops::DecayedAdagradOpMaker); +REGISTER_OP_CPU_KERNEL( + decayed_adagrad, + ops::DecayedAdagradOpKernel); diff --git a/paddle/fluid/operators/decayed_adagrad_op.cu b/paddle/fluid/operators/decayed_adagrad_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..215d6dbc7d80405bf2fdd340c280c299de9e8cc7 --- /dev/null +++ b/paddle/fluid/operators/decayed_adagrad_op.cu @@ -0,0 +1,21 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/decayed_adagrad_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + decayed_adagrad, + ops::DecayedAdagradOpKernel); diff --git a/paddle/fluid/operators/decayed_adagrad_op.h b/paddle/fluid/operators/decayed_adagrad_op.h new file mode 100644 index 0000000000000000000000000000000000000000..52b67586ea3d138f738956c450416698042590fa --- /dev/null +++ b/paddle/fluid/operators/decayed_adagrad_op.h @@ -0,0 +1,56 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
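One scalar step of the update rule above, with made-up parameter, gradient, and moment values and the default decay 0.95 and epsilon 1e-6, just to make the formula concrete:

#include <cmath>
#include <cstdio>

int main() {
  double param = 1.0, grad = 0.2, moment = 0.5;
  const double lr = 0.1, decay = 0.95, epsilon = 1.0e-6;

  // moment_out = decay * moment + (1 - decay) * grad * grad
  const double moment_out = decay * moment + (1.0 - decay) * grad * grad;   // 0.477

  // param_out = param - lr * grad / (sqrt(moment_out) + epsilon)
  const double param_out = param - lr * grad / (std::sqrt(moment_out) + epsilon);

  std::printf("moment_out=%.4f param_out=%.4f\n", moment_out, param_out);
  return 0;
}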
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class DecayedAdagradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto param_out_tensor = ctx.Output("ParamOut"); + auto moment_out_tensor = ctx.Output("MomentOut"); + + param_out_tensor->mutable_data(ctx.GetPlace()); + moment_out_tensor->mutable_data(ctx.GetPlace()); + + float decay = ctx.Attr("decay"); + float epsilon = ctx.Attr("epsilon"); + + auto param = framework::EigenVector::Flatten( + *ctx.Input("Param")); + auto grad = framework::EigenVector::Flatten( + *ctx.Input("Grad")); + auto moment = framework::EigenVector::Flatten( + *ctx.Input("Moment")); + auto lr = framework::EigenVector::Flatten( + *ctx.Input("LearningRate")); + + auto param_out = framework::EigenVector::Flatten(*param_out_tensor); + auto moment_out = framework::EigenVector::Flatten(*moment_out_tensor); + auto& place = *ctx.template device_context().eigen_device(); + + moment_out.device(place) = decay * moment + (1 - decay) * grad * grad; + Eigen::DSizes m_dsize(moment_out_tensor->numel()); + param_out.device(place) = + param - lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/detail/CMakeLists.txt b/paddle/fluid/operators/detail/CMakeLists.txt similarity index 100% rename from paddle/operators/detail/CMakeLists.txt rename to paddle/fluid/operators/detail/CMakeLists.txt diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc new file mode 100644 index 0000000000000000000000000000000000000000..0d395d347ba4f48bae6b879c1daa3adb6f838e77 --- /dev/null +++ b/paddle/fluid/operators/detail/grpc_client.cc @@ -0,0 +1,189 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "grpc_client.h" +#include "paddle/fluid/framework/threadpool.h" +namespace paddle { +namespace operators { +namespace detail { + +bool RPCClient::AsyncSendVariable(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out) { + const platform::DeviceContext* p_ctx = &ctx; + const std::string ep_val = ep; + const std::string var_name_val = var_name; + const framework::Scope* p_scope = &scope; + const auto ch = GetChannel(ep_val); + + framework::Async([var_name_val, p_ctx, ep_val, p_scope, time_out, ch, this] { + auto* var = p_scope->FindVar(var_name_val); + sendrecv::VariableMessage req; + SerializeToMessage(var_name_val, var, *p_ctx, &req); + + // varhandle + VarHandle var_h; + var_h.ep = ep_val; + var_h.scope = p_scope; + var_h.name = var_name_val; + var_h.ctx = p_ctx; + + // stub context + SendProcessor* s = new SendProcessor(ch); + s->Prepare(var_h, time_out); + s->response_call_back_ = NULL; + + auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); + rpc->Finish(&s->reply_, &s->status_, (void*)s); + }); + + req_count_++; + + return true; +} + +void ProcGetResponse(const VarHandle& var_h, + const sendrecv::VariableMessage& ret_msg) { + auto* outvar = var_h.scope->FindVar(var_h.name); + DeserializeFromMessage(ret_msg, *var_h.ctx, outvar); +} + +bool RPCClient::AsyncGetVariable(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out) { + const platform::DeviceContext* p_ctx = &ctx; + const std::string ep_val = ep; + const std::string var_name_val = var_name; + const framework::Scope* p_scope = &scope; + const auto ch = GetChannel(ep_val); + + framework::Async([var_name_val, ep_val, p_scope, p_ctx, time_out, ch, this] { + sendrecv::VariableMessage req; + req.set_varname(var_name_val); + + // varhandle + VarHandle var_h; + var_h.ep = ep_val; + var_h.scope = p_scope; + var_h.name = var_name_val; + var_h.ctx = p_ctx; + + // stub context + GetProcessor* s = new GetProcessor(ch); + s->Prepare(var_h, time_out); + s->response_call_back_ = ProcGetResponse; + + auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_); + rpc->Finish(&s->reply_, &s->status_, (void*)s); + }); + + req_count_++; + + return true; +} + +bool RPCClient::AsyncSendBatchBarrier(const std::string& ep, int64_t time_out) { + const auto ch = GetChannel(ep); + + BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); + s->Prepare(time_out); + + sendrecv::VariableMessage req; + req.set_varname(BATCH_BARRIER_MESSAGE); + auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); + rpc->Finish(&s->reply_, &s->status_, (void*)s); + req_count_++; + + return true; +} + +bool RPCClient::Wait() { + if (req_count_ <= 0) { + return true; + } + const size_t kReqCnt = req_count_; + bool a[kReqCnt]; + std::vector> waits(req_count_); + + for (int i = 0; i < req_count_; i++) { + waits[i] = framework::Async([i, &a, this] { a[i] = Proceed(); }); + } + + for (int i = 0; i < req_count_; i++) { + waits[i].wait(); + } + + int last_req_count = req_count_; + req_count_ = 0; + + for (int i = 0; i < last_req_count; i++) { + if (!a[i]) { + return false; + } + } + + return true; +} + +bool RPCClient::Proceed() { + void* tag = NULL; + bool ok = false; + + // request counts. 
+ if (!cq_.Next(&tag, &ok)) { + LOG(ERROR) << "Get meets CompletionQueue error"; + return false; + } + + GPR_ASSERT(ok); + PADDLE_ENFORCE(tag); + + // TODO(gongwb): add more retries. + ClientBase* c = static_cast(tag); + if (!c->status_.ok()) { + LOG(ERROR) << "proc param error:" << c->var_h_.String() + << " grpc error:" << c->status_.error_message(); + delete c; + return false; + } + + c->Process(); + delete c; + return true; +} + +std::shared_ptr RPCClient::GetChannel(const std::string& ep) { + auto it = channels_.find(ep); + if (it != channels_.end()) { + return it->second; + } + + grpc::ChannelArguments args; + args.SetMaxSendMessageSize(std::numeric_limits::max()); + args.SetMaxReceiveMessageSize(std::numeric_limits::max()); + + auto ch = std::shared_ptr( + grpc::CreateCustomChannel(ep, grpc::InsecureChannelCredentials(), args)); + + channels_[ep] = ch; + return ch; +} + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detail/grpc_client.h b/paddle/fluid/operators/detail/grpc_client.h new file mode 100644 index 0000000000000000000000000000000000000000..314fe8168f0ecdae7b5d2737279050f54185e02a --- /dev/null +++ b/paddle/fluid/operators/detail/grpc_client.h @@ -0,0 +1,171 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
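RPCClient::Wait() above follows a simple fan-out / join pattern: launch one asynchronous task per outstanding request, join them all, then AND the results. The sketch below illustrates only that pattern in isolation; std::async stands in for framework::Async and DoOneRequest is a made-up stand-in for Proceed(), not the real gRPC call.

#include <cstdio>
#include <future>
#include <vector>

bool DoOneRequest(int i) { return i >= 0; }  // pretend every request succeeds

bool WaitAll(int req_count) {
  std::vector<std::future<bool>> waits;
  waits.reserve(req_count);
  for (int i = 0; i < req_count; ++i)
    waits.push_back(std::async(std::launch::async, DoOneRequest, i));

  bool ok = true;
  for (auto& w : waits) ok = w.get() && ok;  // join every task and AND the results
  return ok;
}

int main() {
  std::printf("all ok: %d\n", WaitAll(4));  // all ok: 1
  return 0;
}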
*/ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/detail/sendrecvop_utils.h" +#include "paddle/fluid/operators/detail/simple_block_queue.h" + +namespace paddle { +namespace operators { +namespace detail { + +struct VarHandle { + std::string ep; + const platform::DeviceContext* ctx; + const framework::Scope* scope; + std::string name; + + std::string String() const { + std::ostringstream s; + s << "name:[" << name << "] ep:[" << ep << "]"; + return s.str(); + } +}; + +void ProcGetResponse(const VarHandle& var_h, + const sendrecv::VariableMessage& msg); + +class ClientBase { + public: + explicit ClientBase(std::shared_ptr ch) { + stub_ = sendrecv::SendRecvService::NewStub(ch); + context_ = NULL; + } + + virtual ~ClientBase() {} + + virtual void Prepare(const VarHandle& var_info, int64_t time_out) { + context_.reset(new grpc::ClientContext()); + var_h_ = var_info; + + std::chrono::system_clock::time_point deadline = + std::chrono::system_clock::now() + std::chrono::milliseconds(time_out); + + context_->set_deadline(deadline); + } + + virtual void Prepare(int64_t time_out) { + context_.reset(new grpc::ClientContext()); + + std::chrono::system_clock::time_point deadline = + std::chrono::system_clock::now() + std::chrono::milliseconds(time_out); + + context_->set_deadline(deadline); + } + + virtual void Process() = 0; + + std::unique_ptr stub_; + std::unique_ptr context_; + grpc::Status status_; + VarHandle var_h_; +}; + +typedef std::function + RequestSendCallBack; + +class SendProcessor : public ClientBase { + public: + explicit SendProcessor(std::shared_ptr ch) : ClientBase(ch) {} + + virtual ~SendProcessor() {} + + virtual void Process() { + if (response_call_back_) { + response_call_back_(var_h_, reply_); + } + } + + sendrecv::VoidMessage reply_; + RequestSendCallBack response_call_back_ = NULL; +}; + +typedef std::function + RequestGetCallBack; + +class GetProcessor : public ClientBase { + public: + explicit GetProcessor(std::shared_ptr ch) : ClientBase(ch) {} + + virtual ~GetProcessor() {} + + virtual void Process() { + if (response_call_back_) { + response_call_back_(var_h_, reply_); + } + } + + sendrecv::VariableMessage reply_; + RequestGetCallBack response_call_back_ = ProcGetResponse; +}; + +class BatchBarrierProcessor : public ClientBase { + public: + explicit BatchBarrierProcessor(std::shared_ptr ch) + : ClientBase(ch) {} + + virtual ~BatchBarrierProcessor() {} + + virtual void Process() {} + sendrecv::VoidMessage reply_; +}; + +class RPCClient { + public: + bool AsyncSendVariable(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out = 600 * 1000); + + bool AsyncGetVariable(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out = 600 * 1000); + + bool AsyncSendBatchBarrier(const std::string& ep, + int64_t time_out = 600 * 1000); + + bool Wait(); + + private: + bool Proceed(); + std::shared_ptr GetChannel(const std::string& ep); + + private: + grpc::CompletionQueue cq_; + std::map> channels_; + int64_t req_count_ = 0; +}; + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git 
a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc new file mode 100644 index 0000000000000000000000000000000000000000..96f4ea797b1d8e82fa1c2a52a8b353259906bac2 --- /dev/null +++ b/paddle/fluid/operators/detail/grpc_server.cc @@ -0,0 +1,256 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detail/grpc_server.h" + +using grpc::ServerAsyncResponseWriter; + +namespace paddle { +namespace operators { +namespace detail { + +enum CallStatus { PROCESS = 0, FINISH }; + +// reference: +// https://stackoverflow.com/questions/41732884/grpc-multiple-services-in-cpp-async-server +class RequestBase { + public: + explicit RequestBase(sendrecv::SendRecvService::AsyncService* service, + grpc::ServerCompletionQueue* cq) + : service_(service), cq_(cq), status_(PROCESS) { + PADDLE_ENFORCE(cq_); + } + virtual ~RequestBase() {} + virtual void Process() { assert(false); } + + CallStatus Status() { return status_; } + void SetStatus(CallStatus status) { status_ = status; } + virtual std::string GetReqName() { + assert(false); + return ""; + } + + protected: + grpc::ServerContext ctx_; + sendrecv::SendRecvService::AsyncService* service_; + grpc::ServerCompletionQueue* cq_; + CallStatus status_; +}; + +typedef std::pair MessageWithName; + +class RequestSend final : public RequestBase { + public: + explicit RequestSend(sendrecv::SendRecvService::AsyncService* service, + grpc::ServerCompletionQueue* cq, + SimpleBlockQueue* queue) + : RequestBase(service, cq), queue_(queue), responder_(&ctx_) { + service_->RequestSendVariable(&ctx_, &request_, &responder_, cq_, cq_, + this); + } + + virtual ~RequestSend() {} + + virtual std::string GetReqName() { return request_.varname(); } + + virtual void Process() { + MessageWithName msg_with_name = + std::make_pair(request_.varname(), std::move(request_)); + queue_->Push(std::move(msg_with_name)); + responder_.Finish(reply_, grpc::Status::OK, this); + status_ = FINISH; + } + + protected: + sendrecv::VariableMessage request_; + sendrecv::VoidMessage reply_; + SimpleBlockQueue* queue_; + ServerAsyncResponseWriter responder_; +}; + +class RequestGet final : public RequestBase { + public: + explicit RequestGet(sendrecv::SendRecvService::AsyncService* service, + grpc::ServerCompletionQueue* cq, framework::Scope* scope, + const platform::DeviceContext* dev_ctx, + SimpleBlockQueue* queue) + : RequestBase(service, cq), + responder_(&ctx_), + scope_(scope), + dev_ctx_(dev_ctx), + queue_(queue) { + service_->RequestGetVariable(&ctx_, &request_, &responder_, cq_, cq_, this); + } + + virtual ~RequestGet() {} + + virtual std::string GetReqName() { return request_.varname(); } + + virtual void Process() { + // proc request. + std::string var_name = request_.varname(); + auto* var = scope_->FindVar(var_name); + SerializeToMessage(var_name, var, *dev_ctx_, &reply_); + // TODO(gongwb): check var's info. 
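+    // The get path mirrors the send path: the requested variable has been
+    // looked up in the server-side scope and serialized into reply_ above;
+    // finishing the RPC sends it back, and pushing a token into the get
+    // queue lets WaitClientGet() count how many get requests were answered.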
+ responder_.Finish(reply_, grpc::Status::OK, this); + status_ = FINISH; + queue_->Push('c'); + } + + protected: + sendrecv::VariableMessage request_; + sendrecv::VariableMessage reply_; + ServerAsyncResponseWriter responder_; + framework::Scope* scope_; + const platform::DeviceContext* dev_ctx_; + SimpleBlockQueue* queue_; +}; + +void AsyncGRPCServer::WaitClientGet(int count) { + for (int i = 0; i < count; ++i) { + var_get_queue_.Pop(); + } +} + +void AsyncGRPCServer::RunSyncUpdate() { + grpc::ServerBuilder builder; + builder.AddListeningPort(address_, grpc::InsecureServerCredentials()); + builder.SetMaxSendMessageSize(std::numeric_limits::max()); + builder.SetMaxReceiveMessageSize(std::numeric_limits::max()); + builder.RegisterService(&service_); + + cq_send_ = builder.AddCompletionQueue(); + cq_get_ = builder.AddCompletionQueue(); + + server_ = builder.BuildAndStart(); + LOG(INFO) << "Server listening on " << address_ << std::endl; + + std::function send_register = + std::bind(&AsyncGRPCServer::TryToRegisterNewSendOne, this); + std::function get_register = + std::bind(&AsyncGRPCServer::TryToRegisterNewGetOne, this); + + t_send_.reset( + new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this, + cq_send_.get(), "cq_send", send_register))); + + t_get_.reset( + new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this, + cq_get_.get(), "cq_get", get_register))); + + // wait server + server_->Wait(); + t_send_->join(); + t_get_->join(); +} + +void AsyncGRPCServer::ShutdownQueue() { + std::unique_lock lock(cq_mutex_); + cq_send_->Shutdown(); + cq_get_->Shutdown(); + is_shut_down_ = true; +} + +// This URL explains why shutdown is complicate: +void AsyncGRPCServer::ShutDown() { + server_->Shutdown(); + ShutdownQueue(); +} + +void AsyncGRPCServer::TryToRegisterNewSendOne() { + std::unique_lock lock(cq_mutex_); + if (is_shut_down_) { + return; + } + RequestSend* send = + new RequestSend(&service_, cq_send_.get(), &var_recv_queue_); + VLOG(4) << "Create RequestSend status:" << send->Status(); +} + +void AsyncGRPCServer::TryToRegisterNewGetOne() { + std::unique_lock lock(cq_mutex_); + if (is_shut_down_) { + return; + } + RequestGet* get = new RequestGet(&service_, cq_get_.get(), scope_, dev_ctx_, + &var_get_queue_); + VLOG(4) << "Create RequestGet status:" << get->Status(); +} + +// FIXME(typhoonzero): change cq_name to enum. 
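+// HandleRequest is the event loop run by one thread per completion queue:
+// it registers a single pending request object, then repeatedly calls
+// cq->Next(). A tag still in PROCESS state first registers a replacement
+// request (so the server keeps accepting new calls) and then handles its
+// payload; a tag in FINISH state has already sent its reply and is deleted.
+// The loop exits once the queue has been shut down.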
+void AsyncGRPCServer::HandleRequest(grpc::ServerCompletionQueue* cq, + std::string cq_name, + std::function TryToRegisterNewOne) { + TryToRegisterNewOne(); + + void* tag = NULL; + bool ok = false; + while (true) { + if (!cq->Next(&tag, &ok)) { + LOG(INFO) << cq_name << " get CompletionQueue shutdown!"; + break; + } + + PADDLE_ENFORCE(tag); + // FIXME(typhoonzero): de-couple the barriers with recv_op + if (cq_name == "cq_get") WaitCond(1); + if (cq_name == "cq_send") WaitCond(0); + + RequestBase* base = (RequestBase*)tag; + // reference: + // https://github.com/tensorflow/tensorflow/issues/5596 + // https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM + // https://groups.google.com/forum/#!topic/grpc-io/ywATt88Ef_I + if (!ok) { + LOG(WARNING) << cq_name << " recv no regular event:argument name" + << base->GetReqName(); + TryToRegisterNewOne(); + delete base; + continue; + } + + switch (base->Status()) { + case PROCESS: { + VLOG(4) << cq_name << " status:" << base->Status(); + TryToRegisterNewOne(); + base->Process(); + break; + } + case FINISH: { + VLOG(4) << cq_name << " status:" << base->Status(); + delete base; + break; + } + default: { assert(false); } + } + } +} + +void AsyncGRPCServer::WaitCond(int cond) { + std::unique_lock lock(this->barrier_mutex_); + barrier_condition_.wait(lock, + [=] { return this->barrier_cond_step_ == cond; }); +} + +void AsyncGRPCServer::SetCond(int cond) { + { + std::lock_guard lock(this->barrier_mutex_); + barrier_cond_step_ = cond; + } + barrier_condition_.notify_all(); +} + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detail/grpc_server.h b/paddle/fluid/operators/detail/grpc_server.h new file mode 100644 index 0000000000000000000000000000000000000000..1382d1731838c72008ef782d1c398f534f23f7e6 --- /dev/null +++ b/paddle/fluid/operators/detail/grpc_server.h @@ -0,0 +1,93 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/operators/detail/simple_block_queue.h" + +#include "paddle/fluid/operators/detail/send_recv.grpc.pb.h" +#include "paddle/fluid/operators/detail/send_recv.pb.h" + +#include +#include +#include +#include "paddle/fluid/operators/detail/sendrecvop_utils.h" + +namespace paddle { +namespace operators { +namespace detail { + +typedef std::pair MessageWithName; +class RequestBase; + +class AsyncGRPCServer final : public sendrecv::SendRecvService::Service { + public: + explicit AsyncGRPCServer(const std::string &address) : address_(address) {} + + void RunSyncUpdate(); + + // functions to sync server barrier status. 
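+  // WaitCond/SetCond form a simple two-phase barrier: HandleRequest drains
+  // the send queue only while the condition is 0 and the get queue only
+  // while it is 1, so the op that owns this server can alternate between,
+  // e.g., receiving gradients and serving updated parameters. An
+  // illustrative driving loop (not part of this patch, using only the
+  // methods declared here) might look like:
+  //
+  //   AsyncGRPCServer server(addr);
+  //   server.SetScope(&scope);
+  //   server.SetDevCtx(&dev_ctx);
+  //   std::thread t([&] { server.RunSyncUpdate(); });
+  //   server.SetCond(0);                  // accept incoming variables
+  //   auto msg = server.Get();            // pop one received variable
+  //   // ... apply the update ...
+  //   server.SetCond(1);                  // let clients fetch the results
+  //   server.WaitClientGet(num_clients);  // block until they have done so
+  //   server.ShutDown();
+  //   t.join();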
+ void WaitCond(int cond); + void SetCond(int cond); + void WaitClientGet(int count); + + void SetScope(framework::Scope *scope) { scope_ = scope; } + + void SetDevCtx(const platform::DeviceContext *dev_ctx) { dev_ctx_ = dev_ctx; } + + const MessageWithName Get() { return this->var_recv_queue_.Pop(); } + + void Push(const MessageWithName &msg) { this->var_recv_queue_.Push(msg); } + + void ShutDown(); + + protected: + void HandleRequest(grpc::ServerCompletionQueue *cq, std::string cq_name, + std::function TryToRegisterNewOne); + void TryToRegisterNewSendOne(); + void TryToRegisterNewGetOne(); + void ShutdownQueue(); + + private: + std::mutex cq_mutex_; + volatile bool is_shut_down_ = false; + std::unique_ptr cq_send_; + std::unique_ptr cq_get_; + + sendrecv::SendRecvService::AsyncService service_; + std::unique_ptr server_; + + std::string address_; + framework::Scope *scope_; + const platform::DeviceContext *dev_ctx_; + // received variable from RPC, operators fetch variable from this queue. + SimpleBlockQueue var_recv_queue_; + SimpleBlockQueue var_get_queue_; + + // condition of the sub program + std::mutex barrier_mutex_; + mutable int barrier_cond_step_; + std::condition_variable barrier_condition_; + + std::unique_ptr t_send_; + std::unique_ptr t_get_; +}; + +}; // namespace detail +}; // namespace operators +}; // namespace paddle diff --git a/paddle/operators/detail/safe_ref.h b/paddle/fluid/operators/detail/safe_ref.h similarity index 100% rename from paddle/operators/detail/safe_ref.h rename to paddle/fluid/operators/detail/safe_ref.h diff --git a/paddle/operators/detail/send_recv.proto b/paddle/fluid/operators/detail/send_recv.proto similarity index 100% rename from paddle/operators/detail/send_recv.proto rename to paddle/fluid/operators/detail/send_recv.proto diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.cc b/paddle/fluid/operators/detail/sendrecvop_utils.cc new file mode 100644 index 0000000000000000000000000000000000000000..ba3ae6add6099cb232a5e0df82550b9c2628c05c --- /dev/null +++ b/paddle/fluid/operators/detail/sendrecvop_utils.cc @@ -0,0 +1,68 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/detail/sendrecvop_utils.h" + +namespace paddle { +namespace operators { +namespace detail { + +void SerializeToMessage(const std::string& name, const framework::Variable* var, + const platform::DeviceContext& ctx, + sendrecv::VariableMessage* msg) { + msg->set_varname(name); + std::ostringstream oss; + switch (framework::ToVarType(var->Type())) { + case framework::proto::VarDesc_VarType_LOD_TENSOR: + msg->set_type(sendrecv::VarType::LOD_TENSOR); + framework::SerializeToStream(oss, var->Get(), ctx); + break; + case framework::proto::VarDesc_VarType_SELECTED_ROWS: + msg->set_type(sendrecv::VarType::SELECTED_ROWS); + framework::SerializeToStream(oss, var->Get(), + ctx); + break; + default: { + PADDLE_THROW("Serialize does not support type: %s", + typeid(var->Type()).name()); + break; + } + } + msg->set_serialized(oss.str()); +} + +void DeserializeFromMessage(const sendrecv::VariableMessage& msg, + const platform::DeviceContext& ctx, + framework::Variable* var) { + std::istringstream iss(msg.serialized()); + switch (msg.type()) { + case sendrecv::VarType::LOD_TENSOR: + DeserializeFromStream(iss, var->GetMutable(), ctx); + break; + case sendrecv::VarType::SELECTED_ROWS: { + DeserializeFromStream(iss, var->GetMutable(), + ctx); + break; + } + default: { + PADDLE_THROW("Deserialize does not support type: %s", + typeid(var->Type()).name()); + break; + } + } +} + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.h b/paddle/fluid/operators/detail/sendrecvop_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..fed887c02796463b4b1b7a747883c702c2a95a72 --- /dev/null +++ b/paddle/fluid/operators/detail/sendrecvop_utils.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/var_type.h" + +#include "paddle/fluid/operators/detail/send_recv.grpc.pb.h" +#include "paddle/fluid/operators/detail/send_recv.pb.h" + +namespace paddle { +namespace operators { +namespace detail { + +#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV" +#define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV" + +void SerializeToMessage(const std::string& name, const framework::Variable* var, + const platform::DeviceContext& ctx, + sendrecv::VariableMessage* msg); + +void DeserializeFromMessage(const sendrecv::VariableMessage& msg, + const platform::DeviceContext& ctx, + framework::Variable* var); +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/detail/simple_block_queue.h b/paddle/fluid/operators/detail/simple_block_queue.h similarity index 100% rename from paddle/operators/detail/simple_block_queue.h rename to paddle/fluid/operators/detail/simple_block_queue.h diff --git a/paddle/fluid/operators/detail/strided_memcpy.h b/paddle/fluid/operators/detail/strided_memcpy.h new file mode 100644 index 0000000000000000000000000000000000000000..d7a7eed50b961b0efc04a2a636178fa6578cbf3a --- /dev/null +++ b/paddle/fluid/operators/detail/strided_memcpy.h @@ -0,0 +1,93 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace detail { + +template +struct StridedMemcpyFunctor; + +template +struct StridedMemcpyFunctor { + void operator()(const platform::DeviceContext& dev_ctx, const T* src, + framework::Dim<1> src_stride, framework::Dim<1> dst_dim, + framework::Dim<1> dst_stride, T* dst) const { + auto place = dev_ctx.GetPlace(); + if (platform::is_cpu_place(place)) { + auto& cpu_place = boost::get(place); + memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim.head); + } else { +#ifdef PADDLE_WITH_CUDA + auto& gpu_place = boost::get(place); + auto& cuda_ctx = + reinterpret_cast(dev_ctx); + memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T) * dst_dim.head, + cuda_ctx.stream()); +#else + PADDLE_THROW("Paddle is not compiled with GPU"); +#endif + } + } +}; + +template +struct StridedMemcpyFunctor { + void operator()(const platform::DeviceContext& dev_ctx, const T* src, + framework::Dim src_stride, framework::Dim dst_dim, + framework::Dim dst_stride, T* dst) const { + for (int64_t i = 0; i < dst_dim.head; ++i) { + StridedMemcpyFunctor func; + func(dev_ctx, src, src_stride.tail, dst_dim.tail, dst_stride.tail, dst); + src += src_stride.head; + dst += dst_stride.head; + } + } +}; + +template +struct StridedCopyDimVisitor : public boost::static_visitor { + StridedCopyDimVisitor(const platform::DeviceContext& dev_ctx, const T* src, + const framework::DDim& src_stride, + const framework::DDim& dst_stride, T* dst) + : dev_ctx_(dev_ctx), + src_(src), + src_stride_(src_stride), + dst_stride_(dst_stride), + dst_(dst) {} + + template + void operator()(Dim dst_dim) const { + Dim src_stride = boost::get(src_stride_); + Dim dst_stride = boost::get(dst_stride_); + constexpr int dim = Dim::dimensions; + StridedMemcpyFunctor functor; + functor(dev_ctx_, src_, src_stride, dst_dim, dst_stride, dst_); + } + + const platform::DeviceContext& dev_ctx_; + const T* src_; + const framework::DDim& src_stride_; + const framework::DDim& dst_stride_; + T* dst_; +}; + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection_output_op.cc b/paddle/fluid/operators/detection_output_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..6dee5222959f00141cf5c09257d4a2c96b9e3746 --- /dev/null +++ b/paddle/fluid/operators/detection_output_op.cc @@ -0,0 +1,89 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detection_output_op.h" +namespace paddle { +namespace operators { + +class DetectionOutputOpMaker : public framework::OpProtoAndCheckerMaker { + public: + DetectionOutputOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Loc", + "(Tensor) The input tensor of detection_output operator." 
+ "The input predict locations" + "The format of input tensor is kNCHW. Where K is priorbox point " + "numbers," + "N is How many boxes are there on each point, " + "C is 4, H and W both are 1."); + AddInput("Conf", + "(Tensor) The input tensor of detection_output operator." + "The input priorbox confidence." + "The format of input tensor is kNCHW. Where K is priorbox point " + "numbers," + "N is How many boxes are there on each point, " + "C is the number of classes, H and W both are 1."); + AddInput("PriorBox", + "(Tensor) The input tensor of detection_output operator." + "The format of input tensor is the position and variance " + "of the boxes"); + AddOutput("Out", + "(Tensor) The output tensor of detection_output operator."); + AddAttr("background_label_id", "(int), The background class index."); + AddAttr("num_classes", "(int), The number of the classification."); + AddAttr("nms_threshold", + "(float), The Non-maximum suppression threshold."); + AddAttr("confidence_threshold", + "(float), The classification confidence threshold."); + AddAttr("top_k", "(int), The bbox number kept of the layer’s output."); + AddAttr("nms_top_k", + "(int), The bbox number kept of the NMS’s output."); + AddComment(R"DOC( + detection output for SSD(single shot multibox detector) + Apply the NMS to the output of network and compute the predict + bounding box location. The output’s shape of this layer could + be zero if there is no valid bounding box. + )DOC"); + } +}; + +class DetectionOutputOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Loc"), + "Input(X) of DetectionOutputOp" + "should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Conf"), + "Input(X) of DetectionOutputOp" + "should not be null."); + PADDLE_ENFORCE(ctx->HasInput("PriorBox"), + "Input(X) of DetectionOutputOp" + "should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of DetectionOutputOp should not be null."); + std::vector output_shape({1, 7}); + ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(detection_output, ops::DetectionOutputOp, + ops::DetectionOutputOpMaker); +REGISTER_OP_CPU_KERNEL( + detection_output, + ops::DetectionOutputKernel, + ops::DetectionOutputKernel); diff --git a/paddle/fluid/operators/detection_output_op.cu.cc b/paddle/fluid/operators/detection_output_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..309e03a25be95362b11689f873bafe68570c42e4 --- /dev/null +++ b/paddle/fluid/operators/detection_output_op.cu.cc @@ -0,0 +1,21 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/detection_output_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + detection_output, + ops::DetectionOutputKernel, + ops::DetectionOutputKernel); diff --git a/paddle/fluid/operators/detection_output_op.h b/paddle/fluid/operators/detection_output_op.h new file mode 100644 index 0000000000000000000000000000000000000000..05e5b72bd354329d575c33a88189cfbc64abfea9 --- /dev/null +++ b/paddle/fluid/operators/detection_output_op.h @@ -0,0 +1,167 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/math/detection_util.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/softmax.h" +#include "paddle/fluid/operators/strided_memcpy.h" +namespace paddle { +namespace operators { +template +inline void transpose_fun(const framework::ExecutionContext& context, + const framework::Tensor& src, + framework::Tensor* dst) { + int input_nums = src.dims()[0]; + int offset = 0; + for (int j = 0; j < input_nums; ++j) { + framework::Tensor in_p_tensor = src.Slice(j, j + 1); + std::vector shape_vec( + {in_p_tensor.dims()[0], in_p_tensor.dims()[1], in_p_tensor.dims()[3], + in_p_tensor.dims()[4], in_p_tensor.dims()[2]}); + framework::DDim shape(framework::make_ddim(shape_vec)); + framework::Tensor in_p_tensor_transpose; + in_p_tensor_transpose.mutable_data(shape, context.GetPlace()); + std::vector shape_axis({0, 1, 3, 4, 2}); + math::Transpose trans5; + trans5(context.template device_context(), in_p_tensor, + &in_p_tensor_transpose, shape_axis); + auto dst_stride = framework::stride(dst->dims()); + auto src_stride = framework::stride(in_p_tensor_transpose.dims()); + StridedMemcpy(context.device_context(), in_p_tensor_transpose.data(), + src_stride, in_p_tensor_transpose.dims(), dst_stride, + dst->data() + offset); + offset += in_p_tensor_transpose.dims()[4] * src_stride[4]; + } +} +template +class DetectionOutputKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor* in_loc = context.Input("Loc"); + const framework::Tensor* in_conf = context.Input("Conf"); + const framework::Tensor* in_priorbox = + context.Input("PriorBox"); + auto* out = context.Output("Out"); + int num_classes = context.template Attr("num_classes"); + int top_k = context.template Attr("top_k"); + int nms_top_k = context.template Attr("nms_top_k"); + int background_label_id = context.template Attr("background_label_id"); + float nms_threshold = context.template Attr("nms_threshold"); + float confidence_threshold = + context.template Attr("confidence_threshold"); + size_t batch_size = in_conf->dims()[1]; + int conf_sum_size = in_conf->numel(); + // for softmax + std::vector conf_shape_softmax_vec( + {conf_sum_size / num_classes, num_classes}); + framework::DDim 
conf_shape_softmax( + framework::make_ddim(conf_shape_softmax_vec)); + // for knchw => nhwc + std::vector loc_shape_vec({1, in_loc->dims()[1], in_loc->dims()[3], + in_loc->dims()[4], + in_loc->dims()[2] * in_loc->dims()[0]}); + std::vector conf_shape_vec( + {1, in_conf->dims()[1], in_conf->dims()[3], in_conf->dims()[4], + in_conf->dims()[2] * in_conf->dims()[0]}); + framework::DDim loc_shape(framework::make_ddim(loc_shape_vec)); + framework::DDim conf_shape(framework::make_ddim(conf_shape_vec)); + framework::Tensor loc_tensor; + framework::Tensor conf_tensor; + loc_tensor.mutable_data(loc_shape, context.GetPlace()); + conf_tensor.mutable_data(conf_shape, context.GetPlace()); + // for cpu + framework::Tensor loc_cpu; + framework::Tensor conf_cpu; + framework::Tensor priorbox_cpu; + const T* priorbox_data = in_priorbox->data(); + transpose_fun(context, *in_loc, &loc_tensor); + transpose_fun(context, *in_conf, &conf_tensor); + conf_tensor.Resize(conf_shape_softmax); + math::SoftmaxFunctor()( + context.template device_context(), &conf_tensor, + &conf_tensor); + T* loc_data = loc_tensor.data(); + T* conf_data = conf_tensor.data(); + if (platform::is_gpu_place(context.GetPlace())) { + loc_cpu.mutable_data(loc_tensor.dims(), platform::CPUPlace()); + framework::Copy(loc_tensor, platform::CPUPlace(), + context.device_context(), &loc_cpu); + loc_data = loc_cpu.data(); + conf_cpu.mutable_data(conf_tensor.dims(), platform::CPUPlace()); + framework::Copy(conf_tensor, platform::CPUPlace(), + context.device_context(), &conf_cpu); + conf_data = conf_cpu.data(); + priorbox_cpu.mutable_data(in_priorbox->dims(), platform::CPUPlace()); + framework::Copy(*in_priorbox, platform::CPUPlace(), + context.device_context(), &priorbox_cpu); + priorbox_data = priorbox_cpu.data(); + } + // get decode bboxes + size_t num_priors = in_priorbox->numel() / 8; + std::vector>> all_decoded_bboxes; + for (size_t n = 0; n < batch_size; ++n) { + std::vector> decoded_bboxes; + for (size_t i = 0; i < num_priors; ++i) { + size_t prior_offset = i * 8; + size_t loc_pred_offset = n * num_priors * 4 + i * 4; + std::vector> prior_bbox_vec; + math::GetBBoxFromPriorData(priorbox_data + prior_offset, 1, + prior_bbox_vec); + std::vector> prior_bbox_var; + math::GetBBoxVarFromPriorData(priorbox_data + prior_offset, 1, + prior_bbox_var); + std::vector loc_pred_data; + for (size_t j = 0; j < 4; ++j) + loc_pred_data.push_back(*(loc_data + loc_pred_offset + j)); + math::BBox bbox = math::DecodeBBoxWithVar( + prior_bbox_vec[0], prior_bbox_var[0], loc_pred_data); + decoded_bboxes.push_back(bbox); + } + all_decoded_bboxes.push_back(decoded_bboxes); + } + std::vector>> all_indices; + int num_kept = math::GetDetectionIndices( + conf_data, num_priors, num_classes, background_label_id, batch_size, + confidence_threshold, nms_top_k, nms_threshold, top_k, + all_decoded_bboxes, &all_indices); + + if (num_kept <= 0) { + std::vector out_shape_vec({0, 0}); + framework::DDim out_shape(framework::make_ddim(out_shape_vec)); + out->Resize(out_shape); + return; + } + std::vector out_shape_vec({num_kept, 7}); + framework::DDim out_shape(framework::make_ddim(out_shape_vec)); + out->mutable_data(out_shape, context.GetPlace()); + framework::Tensor out_cpu; + T* out_data = out->data(); + if (platform::is_gpu_place(context.GetPlace())) { + out_cpu.mutable_data(out->dims(), platform::CPUPlace()); + out_data = out_cpu.data(); + } + math::GetDetectionOutput(conf_data, num_kept, num_priors, num_classes, + batch_size, all_indices, all_decoded_bboxes, + out_data); + if 
(platform::is_gpu_place(context.GetPlace())) { + framework::Copy(out_cpu, platform::CUDAPlace(), context.device_context(), + out); + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e1dc900512cb2c20cc1a39b5d11a78f5eb905dc5 --- /dev/null +++ b/paddle/fluid/operators/dropout_op.cc @@ -0,0 +1,113 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/dropout_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class DropoutOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); + + auto x_dims = ctx->GetInputDim("X"); + ctx->SetOutputDim("Out", x_dims); + if (ctx->Attrs().Get("is_test") == false) { + ctx->SetOutputDim("Mask", x_dims); + } + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +template +class DropoutOpMaker : public framework::OpProtoAndCheckerMaker { + public: + DropoutOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input of dropout op."); + AddOutput("Out", "The output of dropout op."); + AddOutput("Mask", "The random sampled dropout mask.").AsIntermediate(); + + AddAttr("dropout_prob", "Probability of setting units to zero.") + .SetDefault(.5f) + .AddCustomChecker([](const float& drop_p) { + PADDLE_ENFORCE(drop_p >= 0.0f && drop_p <= 1.0f, + "'dropout_prob' must be between 0.0 and 1.0."); + }); + AddAttr("is_test", "True if in test phase.").SetDefault(false); + AddAttr("fix_seed", + "A flag indicating whether to use a fixed seed to generate " + "random mask. NOTE: DO NOT set this flag to true in " + "training. Setting this flag to true is only useful in " + "unittest or for debug that always the same output units " + "will be dropped.") + .SetDefault(false); + AddAttr("seed", "Dropout random seed.").SetDefault(0); + + AddComment(R"DOC( +Dropout Operator. + +Dropout refers to randomly dropping out units in a nerual network. It is a +regularization technique for reducing overfitting by preventing neuron +co-adaption during training. The dropout operator randomly set (according to +the given dropout probability) the outputs of some units to zero, while others +are set equal to their corresponding inputs. 
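+As an illustration only: with dropout_prob = 0.5 and X = [1, 2, 3, 4], one
+possible Mask is [1, 0, 1, 0], giving Out = [1, 0, 3, 0] in the training
+phase. In the test phase no mask is produced and the output is simply
+scaled: Out = X * (1 - dropout_prob) = [0.5, 1, 1.5, 2].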
+ +)DOC"); + } +}; + +template +class DropoutOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE_EQ(ctx->Attrs().Get("is_test"), false, + "GradOp is only callable when is_test is false"); + + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); + PADDLE_ENFORCE(ctx->HasInput("Mask"), "Mask must not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) must not be null."); + + auto x_dims = ctx->GetInputDim("X"); + auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); + PADDLE_ENFORCE_EQ(x_dims, out_dims, + "Dimensions of Input(X) and Out@Grad must be the same."); + auto mask_dims = ctx->GetInputDim("Mask"); + PADDLE_ENFORCE_EQ(x_dims, mask_dims, + "Dimensions of Input(X) and Mask must be the same."); + + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(dropout, ops::DropoutOp, ops::DropoutOpMaker, dropout_grad, + ops::DropoutOpGrad); +REGISTER_OP_CPU_KERNEL( + dropout, + ops::CPUDropoutKernel); +REGISTER_OP_CPU_KERNEL( + dropout_grad, + ops::DropoutGradKernel); diff --git a/paddle/fluid/operators/dropout_op.cu b/paddle/fluid/operators/dropout_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..4ae9f4ce54d27dd1ad0312b5ad8d78a4cb904c79 --- /dev/null +++ b/paddle/fluid/operators/dropout_op.cu @@ -0,0 +1,91 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include +#include +#include +#include +#include "paddle/fluid/operators/dropout_op.h" + +namespace paddle { +namespace operators { + +template +struct MaskGenerator { + AttrType dropout_prob; + int seed; + + __host__ __device__ MaskGenerator(AttrType dropout_prob, int seed) + : dropout_prob(dropout_prob), seed(seed) {} + + inline __host__ __device__ T operator()(const unsigned int n) const { + thrust::minstd_rand rng; + rng.seed(seed); + thrust::uniform_real_distribution dist(0, 1); + rng.discard(n); + if (dist(rng) < dropout_prob) { + return static_cast(0); + } + return static_cast(1); + } +}; + +// It seems that Eigen::Tensor::setRandom in GPU will SEGFAULT. +// Use std::random and thrust::random(thrust is a std library in CUDA) to +// implement uniform random. 
+template +class GPUDropoutKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* y = context.Output("Out"); + y->mutable_data(context.GetPlace()); + AttrType dropout_prob = context.Attr("dropout_prob"); + + auto X = EigenMatrix::Reshape(*x, 1); + auto Y = EigenMatrix::Reshape(*y, 1); + + auto& place = *context.template device_context().eigen_device(); + if (!context.Attr("is_test")) { + auto* mask = context.Output("Mask"); + auto* mask_data = mask->mutable_data(context.GetPlace()); + int size = framework::product(mask->dims()); + + std::random_device rnd; + int seed = + context.Attr("fix_seed") ? context.Attr("seed") : rnd(); + + thrust::counting_iterator index_sequence_begin(0); + thrust::transform(index_sequence_begin, index_sequence_begin + size, + thrust::device_ptr(mask_data), + MaskGenerator(dropout_prob, seed)); + auto M = EigenMatrix::Reshape(*mask, 1); + Y.device(place) = X * M; + } else { + Y.device(place) = X * (1.0f - dropout_prob); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + dropout, + ops::GPUDropoutKernel); +REGISTER_OP_CUDA_KERNEL( + dropout_grad, + ops::DropoutGradKernel); diff --git a/paddle/fluid/operators/dropout_op.h b/paddle/fluid/operators/dropout_op.h new file mode 100644 index 0000000000000000000000000000000000000000..9dd1f33669ccf89202abe4a80bd9796411f630ba --- /dev/null +++ b/paddle/fluid/operators/dropout_op.h @@ -0,0 +1,94 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenMatrix = framework::EigenMatrix; + +template +class CPUDropoutKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* y = context.Output("Out"); + const auto* x_data = x->data(); + auto* y_data = y->mutable_data(context.GetPlace()); + float dropout_prob = context.Attr("dropout_prob"); + + if (!context.Attr("is_test")) { + auto* mask = context.Output("Mask"); + auto* mask_data = mask->mutable_data(context.GetPlace()); + + // NOTE: fixed seed should only be used in unittest or for debug. + // Guarantee to use random seed in training. + std::random_device rnd; + std::minstd_rand engine; + int seed = + context.Attr("fix_seed") ? 
context.Attr("seed") : rnd(); + engine.seed(seed); + + std::uniform_real_distribution dist(0, 1); + size_t size = framework::product(mask->dims()); + for (size_t i = 0; i < size; ++i) { + if (dist(engine) < dropout_prob) { + mask_data[i] = 0; + y_data[i] = 0; + } else { + mask_data[i] = 1; + y_data[i] = x_data[i]; + } + } + } else { + auto X = EigenMatrix::Reshape(*x, 1); + auto Y = EigenMatrix::Reshape(*y, 1); + auto& place = + *context.template device_context().eigen_device(); + Y.device(place) = X * (1.0f - dropout_prob); + } + } +}; + +template +class DropoutGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + PADDLE_ENFORCE(!context.Attr("is_test"), + "GradOp is only callable when is_test is false"); + + auto* grad_x = context.Output(framework::GradVarName("X")); + auto* grad_y = context.Input(framework::GradVarName("Out")); + auto* mask = context.Input("Mask"); + grad_x->mutable_data(context.GetPlace()); + + auto M = EigenMatrix::Reshape(*mask, 1); + auto dX = EigenMatrix::Reshape(*grad_x, 1); + auto dY = EigenMatrix::Reshape(*grad_y, 1); + + auto& place = + *context.template device_context().eigen_device(); + dX.device(place) = dY * M; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/edit_distance_op.cc b/paddle/fluid/operators/edit_distance_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..ae82408da71f9424a7a64dc9d3e42759707683b9 --- /dev/null +++ b/paddle/fluid/operators/edit_distance_op.cc @@ -0,0 +1,102 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/edit_distance_op.h" + +namespace paddle { +namespace operators { + +class EditDistanceOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Hyps"), "Input(Hyps) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("Refs"), "Input(Refs) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasOutput("SequenceNum"), + "Output(SequenceNum) shouldn't be null."); + auto hyp_dims = ctx->GetInputDim("Hyps"); + auto ref_dims = ctx->GetInputDim("Refs"); + PADDLE_ENFORCE(hyp_dims.size() == 2 && hyp_dims[1] == 1, + "Input(Hyps) must be a 2-D LoDTensor with the 2nd dimension " + "equal to 1."); + PADDLE_ENFORCE(ref_dims.size() == 2 && ref_dims[1] == 1, + "Input(Refs) must be a 2-D LoDTensor with the 2nd dimension " + "equal to 1."); + ctx->SetOutputDim("Out", ctx->GetInputDim("Refs")); + ctx->SetOutputDim("SequenceNum", {1}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType(framework::proto::DataType::FP32, + ctx.device_context()); + } +}; + +class EditDistanceOpMaker : public framework::OpProtoAndCheckerMaker { + public: + EditDistanceOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Hyps", + "(2-D LoDTensor, 2nd dim. equal to 1) " + "The indices for hypothesis strings."); + AddInput("Refs", + "(2-D LoDTensor, 2nd dim. equal to 1) " + "The indices for reference strings."); + AddOutput("SequenceNum", "The sequence count of current batch"); + AddAttr("normalized", + "(bool, default false) Indicated whether to normalize " + "the edit distance by the length of reference string.") + .SetDefault(false); + AddOutput("Out", + "(2-D Tensor with shape [`batch_size` x 1]) " + "The output edit distances of EditDistance operator."); + AddComment(R"DOC( + +EditDistance operator computes the edit distances between a batch of hypothesis +strings and their references. + +Edit distance, also called Levenshtein distance, measures how dissimilar two strings +are by counting the minimum number of operations to transform one string into anthor. +Here the operations include insertion, deletion, and substitution. For example, +given hypothesis string A = "kitten" and reference B = "sitting", the edit distance +is 3 for A will be transformed into B at least after two substitutions and one +insertion: + + "kitten" -> "sitten" -> "sittin" -> "sitting" + +Input(Hyps) is a LoDTensor consisting of all the hypothesis strings with the total +number denoted by `batch_size`, and the separation is specified by the LoD information. +And the `batch_size` reference strings are arranged in order in the same way in the +LoDTensor Input(Refs). + +Output(Out) contains the `batch_size` results and each stands for the edit stance +for a pair of strings respectively. If Attr(normalized) is true, the edit distance +will be divided by the length of reference string. 
+)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(edit_distance, ops::EditDistanceOp, ops::EditDistanceOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL( + edit_distance, ops::EditDistanceKernel); diff --git a/paddle/fluid/operators/edit_distance_op.cu b/paddle/fluid/operators/edit_distance_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..bdfead75e71752549f44a8b3c9b9e4501e8845a3 --- /dev/null +++ b/paddle/fluid/operators/edit_distance_op.cu @@ -0,0 +1,156 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/cuda_helper.h" +#include "paddle/fluid/platform/gpu_info.h" + +namespace paddle { +namespace operators { + +using platform::PADDLE_CUDA_NUM_THREADS; + +template +__global__ void FillFirstRow(T* dist, const int N) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + if (idx < N + 1) { + dist[idx] = idx; + } +} + +template +__global__ void FillFirstColumn(T* dist, const int M, const int N) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + if (idx < M + 1) { + dist[idx * (N + 1)] = idx; + } +} + +template +__global__ void Levenshtein(T* dist, const int64_t* x1, const int64_t* x2, + const int M, const int N, const int start) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + int offset = N; + int index = start + idx * offset; + int row = index / (N + 1); + int col = index % (N + 1); + if (row > 0 && col > 0 && row < M + 1 && col < N + 1) { + int cost = x1[row - 1] == x2[col - 1] ? 0 : 1; + int dels = dist[(row - 1) * (N + 1) + col] + 1; + int ins = dist[row * (N + 1) + col - 1] + 1; + int subs = dist[(row - 1) * (N + 1) + (col - 1)] + cost; + dist[index] = min(dels, min(ins, subs)); + } +} + +template +__global__ void SetOutput(T* out, const T* dist, const int M, const int N, + bool normalized) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + if (idx == 0) { + out[0] = normalized ? 
dist[M * (N + 1) + N] / N : dist[M * (N + 1) + N]; + } +} + +template +class EditDistanceGPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* out_t = ctx.Output("Out"); + + auto* x1_t = ctx.Input("Hyps"); + auto* x2_t = ctx.Input("Refs"); + auto* sequence_num = ctx.Output("SequenceNum"); + sequence_num->mutable_data(ctx.GetPlace()); + + auto normalized = ctx.Attr("normalized"); + auto stream = reinterpret_cast( + ctx.device_context()) + .stream(); + + auto hyp_lod = x1_t->lod()[0]; + auto ref_lod = x2_t->lod()[0]; + PADDLE_ENFORCE( + hyp_lod.size() == ref_lod.size(), + "Input(Hyps) and Input(Refs) must have the same batch size."); + for (size_t i = 1; i < ref_lod.size(); ++i) { + PADDLE_ENFORCE(ref_lod[i] > ref_lod[i - 1], + "Reference string %d is empty.", i); + } + + const size_t num_strs = hyp_lod.size() - 1; + math::SetConstant set_constant; + set_constant(ctx.template device_context(), + sequence_num, static_cast(num_strs)); + + out_t->Resize({static_cast(num_strs), 1}); + out_t->mutable_data(ctx.GetPlace()); + auto out = out_t->data(); + + T distance = 0.0; + for (size_t num = 0; num < num_strs; num++) { + auto m = static_cast(hyp_lod[num + 1] - hyp_lod[num]); + auto n = static_cast(ref_lod[num + 1] - ref_lod[num]); + if (m == 0 || n == 0) { + distance = std::max(m, n); + if (normalized) { + PADDLE_ENFORCE(n > 0, + "The reference string (#%d) cannot be empty " + "when Attr(normalized) is enabled.", + n); + distance = distance / n; + } + memory::Copy(boost::get(ctx.GetPlace()), out + num, + platform::CPUPlace(), &distance, sizeof(T), stream); + } else { + framework::Tensor dist_t; + dist_t.Resize({m + 1, n + 1}); + dist_t.mutable_data(ctx.GetPlace()); + auto dist = dist_t.data(); + auto x1 = x1_t->data() + hyp_lod[num]; + auto x2 = x2_t->data() + ref_lod[num]; + + FillFirstColumn<<<1 + m / PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>(dist, m, n); + + FillFirstRow<<<1 + n / PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>(dist, n); + // Compute the elements of distance matrix in the anti-diagonal diretion + for (int64_t slice = 2; slice < m + n + 1; ++slice) { + int z_m = slice < m + 1 ? 0 : slice - m; + int z_n = slice < n + 1 ? 0 : slice - n; + int size = slice - (z_m + z_n) + 1; // number of elments in the same + // anti-diagonal line to update + // the start index at which computes from + int start = slice < n + 1 ? slice : (z_n + 1) * (n + 1) - 1; + Levenshtein<<<1 + (size - 1) / PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>(dist, x1, x2, + m, n, start); + } + SetOutput<<<1, 1, 0, stream>>>(out + num, dist, m, n, normalized); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + edit_distance, + ops::EditDistanceGPUKernel); diff --git a/paddle/fluid/operators/edit_distance_op.h b/paddle/fluid/operators/edit_distance_op.h new file mode 100644 index 0000000000000000000000000000000000000000..205e16e6bfe6b2d1678fca258ce1e70d29eff331 --- /dev/null +++ b/paddle/fluid/operators/edit_distance_op.h @@ -0,0 +1,98 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +namespace paddle { +namespace operators { + +template +class EditDistanceKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* out_t = ctx.Output("Out"); + + auto* x1_t = ctx.Input("Hyps"); + auto* x2_t = ctx.Input("Refs"); + auto* sequence_num = ctx.Output("SequenceNum"); + int64_t* seq_num_data = sequence_num->mutable_data(ctx.GetPlace()); + + auto normalized = ctx.Attr("normalized"); + + auto hyp_lod = x1_t->lod()[0]; + auto ref_lod = x2_t->lod()[0]; + PADDLE_ENFORCE( + hyp_lod.size() == ref_lod.size(), + "Input(Hyps) and Input(Refs) must have the same batch size."); + for (size_t i = 1; i < ref_lod.size(); ++i) { + PADDLE_ENFORCE(ref_lod[i] > ref_lod[i - 1], + "Reference string %d is empty.", i); + } + auto num_strs = hyp_lod.size() - 1; + *seq_num_data = static_cast(num_strs); + + out_t->Resize({static_cast(num_strs), 1}); + out_t->mutable_data(ctx.GetPlace()); + auto out = out_t->data(); + + T distance = 0.0; + for (size_t num = 0; num < num_strs; ++num) { + auto m = static_cast(hyp_lod[num + 1] - hyp_lod[num]); + auto n = static_cast(ref_lod[num + 1] - ref_lod[num]); + + if (m == 0) { + distance = n; + } else if (n == 0) { + distance = m; + } else { + framework::Tensor dist_t; + dist_t.Resize({m + 1, n + 1}); + dist_t.mutable_data(ctx.GetPlace()); + auto dist = dist_t.data(); + auto x1 = x1_t->data() + hyp_lod[num]; + auto x2 = x2_t->data() + ref_lod[num]; + for (int64_t i = 0; i < m + 1; ++i) { + dist[i * (n + 1)] = i; + } + for (int64_t j = 0; j < n + 1; ++j) { + dist[j] = j; + } + for (int64_t i = 1; i < m + 1; ++i) { + for (int64_t j = 1; j < n + 1; ++j) { + int cost = x1[i - 1] == x2[j - 1] ? 0 : 1; + int dels = dist[(i - 1) * (n + 1) + j] + 1; + int ins = dist[i * (n + 1) + (j - 1)] + 1; + int subs = dist[(i - 1) * (n + 1) + (j - 1)] + cost; + dist[i * (n + 1) + j] = std::min(dels, std::min(ins, subs)); + } + } + distance = dist[m * (n + 1) + n]; + } + + if (normalized) { + PADDLE_ENFORCE(n > 0, + "The reference string (#%d) cannot be empty " + "when Attr(normalized) is enabled.", + n); + distance = distance / n; + } + out[num] = distance; + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/elementwise_add_op.cc b/paddle/fluid/operators/elementwise_add_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..5b9947b8c935fd2b4739162cbd2f98dc965cec2a --- /dev/null +++ b/paddle/fluid/operators/elementwise_add_op.cc @@ -0,0 +1,45 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/elementwise_add_op.h" +#include "paddle/fluid/operators/elementwise_op.h" + +namespace paddle { +namespace operators { +class ElementwiseAddOpMaker : public ElementwiseOpMaker { + public: + ElementwiseAddOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : ElementwiseOpMaker(proto, op_checker) { + SetComment("Add", "Out = X + Y"); + AddComment(comment_); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(elementwise_add, ops::ElementwiseOp, ops::ElementwiseAddOpMaker, + elementwise_add_grad, ops::ElementwiseOpGrad); +REGISTER_OP_CPU_KERNEL( + elementwise_add, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel); +REGISTER_OP_CPU_KERNEL( + elementwise_add_grad, + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel); diff --git a/paddle/fluid/operators/elementwise_add_op.cu b/paddle/fluid/operators/elementwise_add_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..2ac3a998ec46528ccb72fad1f5d73ce88992d995 --- /dev/null +++ b/paddle/fluid/operators/elementwise_add_op.cu @@ -0,0 +1,32 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/elementwise_add_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + elementwise_add, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel); +REGISTER_OP_CUDA_KERNEL( + elementwise_add_grad, + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel); diff --git a/paddle/fluid/operators/elementwise_add_op.h b/paddle/fluid/operators/elementwise_add_op.h new file mode 100644 index 0000000000000000000000000000000000000000..248e3b9d617fadd914f27ad02861e27078600b61 --- /dev/null +++ b/paddle/fluid/operators/elementwise_add_op.h @@ -0,0 +1,120 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/operators/elementwise_op_function.h" + +namespace paddle { +namespace operators { + +template +struct AddFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a + b; } +}; + +template +class ElementwiseAddKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + AddFunctor(), z); + } +}; + +template +struct ElementwiseAddGradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) { + auto dz_e = framework::EigenVector::Flatten(*dz); + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e; + } + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = dz_e; + } + } +}; + +template +struct ElementwiseAddBroadCastGradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) { + auto dz_e = framework::EigenVector::Flatten(*dz); + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = dz_e.reshape(Eigen::DSizes(pre, n)) + .sum(Eigen::array{{0}}); + } + } +}; + +template +struct ElementwiseAddBroadCast2GradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n, + Post post) { + auto dz_e = framework::EigenVector::Flatten(*dz); + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = dz_e.reshape(Eigen::DSizes(pre, n, post)) + .sum(Eigen::array{{0, 2}}); + } + } +}; + +template +class ElementwiseAddGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); + ElementwiseGradCompute, + ElementwiseAddBroadCastGradFunctor, + ElementwiseAddBroadCast2GradFunctor>( + ctx, x, y, out, dout, axis, dx, dy); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/elementwise_div_op.cc b/paddle/fluid/operators/elementwise_div_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..818ae82f44ccd159b36944e67521c3b730214539 --- /dev/null +++ b/paddle/fluid/operators/elementwise_div_op.cc @@ -0,0 +1,46 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
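In the broadcast cases above, Y was replicated pre (and post) times to match X, so its gradient must sum dOut over exactly those replicated positions; for addition the upstream gradient flows through unchanged, which is why the functors reduce dz reshaped to (pre, n) or (pre, n, post) over the broadcast axes. A minimal loop-based sketch of that reduction (illustrative names, not part of the operator):

#include <vector>

// dz holds pre * n * post elements laid out as [pre][n][post]; dy has n
// elements, one per element of the broadcast input Y. For elementwise_add the
// upstream gradient passes through unchanged, so dY is the sum of dZ over the
// replicated (pre and post) positions -- the same reduction the Eigen
// expressions above perform with reshape(...).sum({0, 2}) (post == 1 reduces
// to the (pre, n).sum({0}) case).
std::vector<float> ReduceBroadcastGrad(const std::vector<float>& dz, int pre,
                                       int n, int post) {
  std::vector<float> dy(n, 0.0f);
  for (int p = 0; p < pre; ++p) {
    for (int k = 0; k < n; ++k) {
      for (int q = 0; q < post; ++q) {
        dy[k] += dz[(p * n + k) * post + q];
      }
    }
  }
  return dy;
}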
*/ + +#include "paddle/fluid/operators/elementwise_div_op.h" +#include "paddle/fluid/operators/elementwise_op.h" + +namespace paddle { +namespace operators { +class ElementwiseDivOpMaker : public ElementwiseOpMaker { + public: + ElementwiseDivOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : ElementwiseOpMaker(proto, op_checker) { + SetComment("Div", "Out = X / Y"); + AddComment(comment_); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(elementwise_div, ops::ElementwiseOp, ops::ElementwiseDivOpMaker, + elementwise_div_grad, ops::ElementwiseOpGrad); +REGISTER_OP_CPU_KERNEL( + elementwise_div, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel); +REGISTER_OP_CPU_KERNEL( + elementwise_div_grad, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel); diff --git a/paddle/fluid/operators/elementwise_div_op.cu b/paddle/fluid/operators/elementwise_div_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..d1bb7a474c06f68d33412512a9eb99757634d18e --- /dev/null +++ b/paddle/fluid/operators/elementwise_div_op.cu @@ -0,0 +1,32 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/elementwise_div_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + elementwise_div, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel); +REGISTER_OP_CUDA_KERNEL( + elementwise_div_grad, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel); diff --git a/paddle/fluid/operators/elementwise_div_op.h b/paddle/fluid/operators/elementwise_div_op.h new file mode 100644 index 0000000000000000000000000000000000000000..8e0726d9465fa988646f6f8dc74857a3bedf43e8 --- /dev/null +++ b/paddle/fluid/operators/elementwise_div_op.h @@ -0,0 +1,139 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/operators/elementwise_op_function.h" + +namespace paddle { +namespace operators { + +template +struct DivFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a / b; } +}; + +template +class ElementwiseDivKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + DivFunctor(), z); + } +}; + +template +struct ElementwiseDivGradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) { + auto y_e = framework::EigenVector::Flatten(*y); + auto z_e = framework::EigenVector::Flatten(*z); + auto dz_e = framework::EigenVector::Flatten(*dz); + + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e / y_e; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = -1.0 * dz_e * z_e / y_e; + } + } +}; + +template +struct ElementwiseDivBroadCastGradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) { + auto x_e = framework::EigenVector::Flatten(*x); + auto y_e = framework::EigenVector::Flatten(*y); + auto dz_e = framework::EigenVector::Flatten(*dz); + + auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n)) + .broadcast(Eigen::DSizes(pre, 1)) + .reshape(Eigen::DSizes(x_e.size())); + + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e / y_e_bcast; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = (-1.0 * (x_e * dz_e) / (y_e_bcast * y_e_bcast)) + .reshape(Eigen::DSizes(pre, n)) + .sum(Eigen::array{{0}}); + } + } +}; + +template +struct ElementwiseDivBroadCast2GradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n, + Post post) { + auto x_e = framework::EigenVector::Flatten(*x); + auto y_e = framework::EigenVector::Flatten(*y); + auto dz_e = framework::EigenVector::Flatten(*dz); + + auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n, 1)) + .broadcast(Eigen::DSizes(pre, 1, post)) + .reshape(Eigen::DSizes(x_e.size())); + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e / y_e_bcast; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = (-1.0 * (x_e * dz_e) / (y_e_bcast * y_e_bcast)) + .reshape(Eigen::DSizes(pre, n, post)) + .sum(Eigen::array{{0, 2}}); + } + } +}; + +template +class ElementwiseDivGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); + ElementwiseGradCompute, + ElementwiseDivBroadCastGradFunctor, + ElementwiseDivBroadCast2GradFunctor>( + ctx, x, y, out, dout, axis, dx, dy); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/elementwise_max_op.cc b/paddle/fluid/operators/elementwise_max_op.cc new file mode 100644 index 
0000000000000000000000000000000000000000..1331bcadc8ce9a114d3dd7604273a3512e821e91 --- /dev/null +++ b/paddle/fluid/operators/elementwise_max_op.cc @@ -0,0 +1,45 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/elementwise_max_op.h" +#include "paddle/fluid/operators/elementwise_op.h" + +namespace paddle { +namespace operators { +class ElementwiseMaxOpMaker : public ElementwiseOpMaker { + public: + ElementwiseMaxOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : ElementwiseOpMaker(proto, op_checker) { + SetComment("Max", "Out = max(X, Y)"); + AddComment(comment_); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(elementwise_max, ops::ElementwiseOp, ops::ElementwiseMaxOpMaker, + elementwise_max_grad, ops::ElementwiseOpGrad); +REGISTER_OP_CPU_KERNEL( + elementwise_max, + ops::ElementwiseMaxKernel, + ops::ElementwiseMaxKernel, + ops::ElementwiseMaxKernel, + ops::ElementwiseMaxKernel); +REGISTER_OP_CPU_KERNEL( + elementwise_max_grad, + ops::ElementwiseMaxGradKernel, + ops::ElementwiseMaxGradKernel, + ops::ElementwiseMaxGradKernel, + ops::ElementwiseMaxGradKernel); diff --git a/paddle/fluid/operators/elementwise_max_op.cu b/paddle/fluid/operators/elementwise_max_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..7f0259ad0024c1a9b640f73ade5711b6eaa8f871 --- /dev/null +++ b/paddle/fluid/operators/elementwise_max_op.cu @@ -0,0 +1,32 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/elementwise_max_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + elementwise_max, + ops::ElementwiseMaxKernel, + ops::ElementwiseMaxKernel, + ops::ElementwiseMaxKernel, + ops::ElementwiseMaxKernel); +REGISTER_OP_CUDA_KERNEL( + elementwise_max_grad, + ops::ElementwiseMaxGradKernel, + ops::ElementwiseMaxGradKernel, + ops::ElementwiseMaxGradKernel, + ops::ElementwiseMaxGradKernel); diff --git a/paddle/fluid/operators/elementwise_max_op.h b/paddle/fluid/operators/elementwise_max_op.h new file mode 100644 index 0000000000000000000000000000000000000000..e1db9bcc01104c3462b562b7a37b5817e867d7e4 --- /dev/null +++ b/paddle/fluid/operators/elementwise_max_op.h @@ -0,0 +1,138 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/operators/elementwise_op_function.h" + +namespace paddle { +namespace operators { + +template +struct MaxFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a > b ? a : b; } +}; + +template +class ElementwiseMaxKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + MaxFunctor(), z); + } +}; + +template +struct ElementwiseMaxGradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) { + auto x_e = framework::EigenVector::Flatten(*x); + auto y_e = framework::EigenVector::Flatten(*y); + auto dz_e = framework::EigenVector::Flatten(*dz); + + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = (x_e > y_e).template cast() * dz_e; + } + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = (x_e <= y_e).template cast() * dz_e; + } + } +}; + +template +struct ElementwiseMaxBroadCastGradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) { + auto x_e = framework::EigenVector::Flatten(*x); + auto y_e = framework::EigenVector::Flatten(*y); + auto dz_e = framework::EigenVector::Flatten(*dz); + + auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n)) + .broadcast(Eigen::DSizes(pre, 1)) + .reshape(Eigen::DSizes(x_e.size())); + + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = (x_e > y_e_bcast).template cast() * dz_e; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = ((x_e <= y_e_bcast).template cast() * dz_e) + .reshape(Eigen::DSizes(pre, n)) + .sum(Eigen::array{{0}}); + } + } +}; + +template +struct ElementwiseMaxBroadCast2GradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n, + Post post) { + auto x_e = framework::EigenVector::Flatten(*x); + auto y_e = framework::EigenVector::Flatten(*y); + auto dz_e = framework::EigenVector::Flatten(*dz); + + auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n, 1)) + .broadcast(Eigen::DSizes(pre, 1, post)) + .reshape(Eigen::DSizes(x_e.size())); + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = (x_e > y_e_bcast).template cast() * dz_e; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = ((x_e <= y_e_bcast).template cast() * dz_e) + .reshape(Eigen::DSizes(pre, n, post)) + .sum(Eigen::array{{0, 2}}); + } + } +}; + +template +class ElementwiseMaxGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out 
= ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); + ElementwiseGradCompute, + ElementwiseMaxBroadCastGradFunctor, + ElementwiseMaxBroadCast2GradFunctor>( + ctx, x, y, out, dout, axis, dx, dy); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/elementwise_min_op.cc b/paddle/fluid/operators/elementwise_min_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..1d69099c8e6bd8e11a605758f553a9edd9cc322e --- /dev/null +++ b/paddle/fluid/operators/elementwise_min_op.cc @@ -0,0 +1,45 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/elementwise_min_op.h" +#include "paddle/fluid/operators/elementwise_op.h" + +namespace paddle { +namespace operators { +class ElementwiseMinOpMaker : public ElementwiseOpMaker { + public: + ElementwiseMinOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : ElementwiseOpMaker(proto, op_checker) { + SetComment("Min", "Out = min(X, Y)"); + AddComment(comment_); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(elementwise_min, ops::ElementwiseOp, ops::ElementwiseMinOpMaker, + elementwise_min_grad, ops::ElementwiseOpGrad); +REGISTER_OP_CPU_KERNEL( + elementwise_min, + ops::ElementwiseMinKernel, + ops::ElementwiseMinKernel, + ops::ElementwiseMinKernel, + ops::ElementwiseMinKernel); +REGISTER_OP_CPU_KERNEL( + elementwise_min_grad, + ops::ElementwiseMinGradKernel, + ops::ElementwiseMinGradKernel, + ops::ElementwiseMinGradKernel, + ops::ElementwiseMinGradKernel); diff --git a/paddle/fluid/operators/elementwise_min_op.cu b/paddle/fluid/operators/elementwise_min_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..ed53204735056477b5c59ce5082d377501409c65 --- /dev/null +++ b/paddle/fluid/operators/elementwise_min_op.cu @@ -0,0 +1,32 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
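The max and min gradient functors above route each element's upstream gradient to exactly one input: for max, dX receives it where x > y and dY where x <= y (ties deliberately go to Y, the same subgradient convention the min functors mirror with x >= y), so dX + dY reproduces dOut elementwise before any broadcast reduction. A scalar sketch of that masking (illustrative only, not part of the operator):

// Per-element gradient routing used by the max grad functors above.
// Ties (x == y) send the gradient to y, matching the (x <= y) mask in the
// Eigen expressions; the min functors use the complementary (x < y) / (x >= y)
// masks.
void MaxGradPerElement(float x, float y, float dz, float* dx, float* dy) {
  *dx = x > y ? dz : 0.0f;
  *dy = x > y ? 0.0f : dz;  // includes the tie case
}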
*/ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/elementwise_min_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + elementwise_min, + ops::ElementwiseMinKernel, + ops::ElementwiseMinKernel, + ops::ElementwiseMinKernel, + ops::ElementwiseMinKernel); +REGISTER_OP_CUDA_KERNEL( + elementwise_min_grad, + ops::ElementwiseMinGradKernel, + ops::ElementwiseMinGradKernel, + ops::ElementwiseMinGradKernel, + ops::ElementwiseMinGradKernel); diff --git a/paddle/fluid/operators/elementwise_min_op.h b/paddle/fluid/operators/elementwise_min_op.h new file mode 100644 index 0000000000000000000000000000000000000000..bfe213dd4318aef8b1fb299ba40981d2feef8f9d --- /dev/null +++ b/paddle/fluid/operators/elementwise_min_op.h @@ -0,0 +1,138 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/operators/elementwise_op_function.h" + +namespace paddle { +namespace operators { + +template +struct MinFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a < b ? a : b; } +}; + +template +class ElementwiseMinKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + MinFunctor(), z); + } +}; + +template +struct ElementwiseMinGradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) { + auto x_e = framework::EigenVector::Flatten(*x); + auto y_e = framework::EigenVector::Flatten(*y); + auto dz_e = framework::EigenVector::Flatten(*dz); + + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = (x_e < y_e).template cast() * dz_e; + } + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = (x_e >= y_e).template cast() * dz_e; + } + } +}; + +template +struct ElementwiseMinBroadCastGradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) { + auto x_e = framework::EigenVector::Flatten(*x); + auto y_e = framework::EigenVector::Flatten(*y); + auto dz_e = framework::EigenVector::Flatten(*dz); + + auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n)) + .broadcast(Eigen::DSizes(pre, 1)) + .reshape(Eigen::DSizes(x_e.size())); + + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = (x_e < y_e_bcast).template cast() * dz_e; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = ((x_e >= y_e_bcast).template cast() * dz_e) + .reshape(Eigen::DSizes(pre, n)) + .sum(Eigen::array{{0}}); + } + } +}; + +template +struct ElementwiseMinBroadCast2GradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n, + Post post) { + auto x_e = framework::EigenVector::Flatten(*x); + auto y_e = 
framework::EigenVector::Flatten(*y); + auto dz_e = framework::EigenVector::Flatten(*dz); + + auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n, 1)) + .broadcast(Eigen::DSizes(pre, 1, post)) + .reshape(Eigen::DSizes(x_e.size())); + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = (x_e < y_e_bcast).template cast() * dz_e; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = ((x_e >= y_e_bcast).template cast() * dz_e) + .reshape(Eigen::DSizes(pre, n, post)) + .sum(Eigen::array{{0, 2}}); + } + } +}; + +template +class ElementwiseMinGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); + ElementwiseGradCompute, + ElementwiseMinBroadCastGradFunctor, + ElementwiseMinBroadCast2GradFunctor>( + ctx, x, y, out, dout, axis, dx, dy); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/elementwise_mul_op.cc b/paddle/fluid/operators/elementwise_mul_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0cb96f21d1b5049f9c7193f0a951e08986884c70 --- /dev/null +++ b/paddle/fluid/operators/elementwise_mul_op.cc @@ -0,0 +1,47 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/elementwise_mul_op.h" +#include "paddle/fluid/operators/elementwise_op.h" + +namespace paddle { +namespace operators { + +class ElementwiseMulOpMaker : public ElementwiseOpMaker { + public: + ElementwiseMulOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : ElementwiseOpMaker(proto, op_checker) { + SetComment("Mul", "Out = X \\odot\\ Y"); + AddComment(comment_); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(elementwise_mul, ops::ElementwiseOp, ops::ElementwiseMulOpMaker, + elementwise_mul_grad, ops::ElementwiseOpGrad); +REGISTER_OP_CPU_KERNEL( + elementwise_mul, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel); +REGISTER_OP_CPU_KERNEL( + elementwise_mul_grad, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel); diff --git a/paddle/fluid/operators/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise_mul_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..d72b6250eed24adbfc18e95a659a82b8e9916bc1 --- /dev/null +++ b/paddle/fluid/operators/elementwise_mul_op.cu @@ -0,0 +1,32 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/elementwise_mul_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + elementwise_mul, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel); +REGISTER_OP_CUDA_KERNEL( + elementwise_mul_grad, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel); diff --git a/paddle/fluid/operators/elementwise_mul_op.h b/paddle/fluid/operators/elementwise_mul_op.h new file mode 100644 index 0000000000000000000000000000000000000000..dc292eb1e7295780aacb0c34730044c3c8759cb7 --- /dev/null +++ b/paddle/fluid/operators/elementwise_mul_op.h @@ -0,0 +1,138 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/operators/elementwise_op_function.h" + +namespace paddle { +namespace operators { + +template +struct MulFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a * b; } +}; + +template +class ElementwiseMulKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + MulFunctor(), z); + } +}; + +template +struct ElementwiseMulGradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) { + auto x_e = framework::EigenVector::Flatten(*x); + auto y_e = framework::EigenVector::Flatten(*y); + auto dz_e = framework::EigenVector::Flatten(*dz); + + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e * y_e; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = x_e * dz_e; + } + } +}; + +template +struct ElementwiseMulBroadCastGradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) { + auto x_e = framework::EigenVector::Flatten(*x); + auto y_e = framework::EigenVector::Flatten(*y); + auto dz_e = framework::EigenVector::Flatten(*dz); + + auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n)) + .broadcast(Eigen::DSizes(pre, 1)) + .reshape(Eigen::DSizes(x_e.size())); + + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e * y_e_bcast; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = (x_e * dz_e) + .reshape(Eigen::DSizes(pre, n)) + .sum(Eigen::array{{0}}); + } + } +}; + +template +struct ElementwiseMulBroadCast2GradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n, + Post post) { + auto x_e = framework::EigenVector::Flatten(*x); + auto y_e = framework::EigenVector::Flatten(*y); + auto dz_e = framework::EigenVector::Flatten(*dz); + + auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n, 1)) + .broadcast(Eigen::DSizes(pre, 1, post)) + .reshape(Eigen::DSizes(x_e.size())); + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e * y_e_bcast; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = (x_e * dz_e) + .reshape(Eigen::DSizes(pre, n, post)) + .sum(Eigen::array{{0, 2}}); + } + } +}; + +template +class ElementwiseMulGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); + ElementwiseGradCompute, + ElementwiseMulBroadCastGradFunctor, + ElementwiseMulBroadCast2GradFunctor>( + ctx, x, y, out, dout, axis, dx, dy); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/elementwise_op.h b/paddle/fluid/operators/elementwise_op.h new file mode 100644 index 0000000000000000000000000000000000000000..38f83d7ad36d3cb6b42a283fec3c431b51747d4e --- /dev/null +++ b/paddle/fluid/operators/elementwise_op.h @@ -0,0 +1,137 @@ 
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { + +class ElementwiseOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + using Tensor = framework::Tensor; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of elementwise op should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), + "Input(Y) of elementwise op should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of elementwise op should not be null."); + + auto x_dim = ctx->GetInputDim("X"); + auto y_dim = ctx->GetInputDim("Y"); + PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(), + "Rank of first input must >= rank of second input."); + ctx->SetOutputDim("Out", x_dim); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ElementwiseOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor), The first input tensor of elementwise op."); + AddInput("Y", "(Tensor), The second input tensor of elementwise op."); + AddOutput("Out", "The output of elementwise op."); + AddAttr("axis", + "(int, default -1). The start dimension index " + "for broadcasting Y onto X.") + .SetDefault(-1) + .EqualGreaterThan(-1); + comment_ = R"DOC( +Limited Elementwise {name} Operator. + +The equation is: + +$${equation}$$ + +$X$ is a tensor of any dimension and the dimensions of tensor $Y$ must be +smaller than or equal to the dimensions of $X$. + +There are two cases for this operator: +1. The shape of $Y$ is the same as that of $X$; +2. The shape of $Y$ is a subset of $X$. + +For case 2: +$Y$ will be broadcast to match the shape of $X$, and axis should be +set to the index of the start dimension along which $Y$ is broadcast onto $X$. + +For example: + .. code-block:: python + + shape(X) = (2, 3, 4, 5), shape(Y) = (,) + shape(X) = (2, 3, 4, 5), shape(Y) = (5,) + shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5) + shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1 + shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0 + +Either input ($X$ or $Y$), or neither, can carry the LoD (Level of Details) +information. However, the output shares the LoD information only with input $X$.
+ +)DOC"; + AddComment(comment_); + } + + protected: + std::string comment_; + + void Replace(std::string& src, std::string from, std::string to) { + std::size_t len_from = std::strlen(from.c_str()); + std::size_t len_to = std::strlen(to.c_str()); + for (std::size_t pos = src.find(from); pos != std::string::npos; + pos = src.find(from, pos + len_to)) { + src.replace(pos, len_from, to); + } + } + + void SetComment(std::string name, std::string equation) { + Replace(comment_, "{name}", name); + Replace(comment_, "{equation}", equation); + } +}; + +class ElementwiseOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + using Tensor = framework::Tensor; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); + + PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), + "Rank of first input must >= rank of second input."); + + auto x_grad_name = framework::GradVarName("X"); + auto y_grad_name = framework::GradVarName("Y"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + if (ctx->HasOutput(y_grad_name)) { + ctx->SetOutputDim(y_grad_name, y_dims); + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/elementwise_op_function.h b/paddle/fluid/operators/elementwise_op_function.h new file mode 100644 index 0000000000000000000000000000000000000000..c1269382a447d4f8d089c5fc392495d418123e48 --- /dev/null +++ b/paddle/fluid/operators/elementwise_op_function.h @@ -0,0 +1,406 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/transform.h" + +#ifdef __NVCC__ +#include +#endif + +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +/* + * Out = X ⊙ Y + * If Y's shape does not match X' shape, they will be reshaped. + * For example: + * 1. shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1 + * pre=2, n=3*4, post=5 + * x.shape(2, 12, 5) * y.shape(1,12,1).broadcast(2,12,5) + * 2. 
shape(X) = (2, 3, 4, 5), shape(Y) = (4,5) + * pre=2*3, n=4*5, post=1 + * x.shape(2, 3, 20) * y.shape(1,1,20).broadcast(2,3,20) + */ +inline void get_mid_dims(const framework::DDim& x_dims, + const framework::DDim& y_dims, const int axis, + int& pre, int& n, int& post) { + pre = 1; + n = 1; + post = 1; + for (int i = 0; i < axis; ++i) { + pre *= x_dims[i]; + } + + for (int i = 0; i < y_dims.size(); ++i) { + PADDLE_ENFORCE_EQ(x_dims[i + axis], y_dims[i], + "Broadcast dimension mismatch."); + n *= y_dims[i]; + } + + for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) { + post *= x_dims[i]; + } +} + +template +class RowwiseTransformIterator; +template +class MidWiseTransformIterator; + +template +class RowwiseTransformIterator { + public: + RowwiseTransformIterator(const T* ptr, int n) : ptr_(ptr), i_(0), n_(n) {} + + RowwiseTransformIterator& operator++() { + ++i_; + if (UNLIKELY(i_ == n_)) { + i_ = 0; + } + return *this; + } + + bool operator==(const RowwiseTransformIterator& + rhs) const { + return (ptr_ + i_) == &(*rhs); + } + + bool operator!=(const RowwiseTransformIterator& + rhs) const { + return (ptr_ + i_) != &(*rhs); + } + + const T& operator*() { return ptr_[i_]; } + + private: + const T* ptr_; + int i_; + int64_t n_; +}; + +template +class MidWiseTransformIterator { + public: + MidWiseTransformIterator(const T* ptr, int n, int post) + : ptr_(ptr), i_(0), j_(0), n_(n), post_(post) {} + + MidWiseTransformIterator& operator++() { + ++j_; + if (UNLIKELY(j_ == post_)) { + ++i_; + j_ = 0; + if (UNLIKELY(i_ == n_)) { + i_ = 0; + } + } + return *this; + } + + bool operator==(const MidWiseTransformIterator& + rhs) const { + return (ptr_ + i_) == &(*rhs); + } + + bool operator!=(const MidWiseTransformIterator& + rhs) const { + return (ptr_ + i_) != &(*rhs); + } + + const T& operator*() { return ptr_[i_]; } + + private: + const T* ptr_; + int64_t i_; + int64_t j_; + int64_t n_; + int64_t post_; +}; + +#ifdef __NVCC__ +template +class RowwiseTransformIterator + : public thrust::iterator_adaptor< + RowwiseTransformIterator, const T*> { + public: + typedef thrust::iterator_adaptor< + RowwiseTransformIterator, const T*> + super_t; + HOSTDEVICE RowwiseTransformIterator(const T* x, int n) + : super_t(x), begin_(x), n_(n){}; + friend class thrust::iterator_core_access; + + private: + unsigned int n_; + const T* begin_; + HOSTDEVICE typename super_t::reference dereference() const { + return *(begin_ + (this->base() - begin_) % n_); + } +}; + +template +class MidWiseTransformIterator + : public thrust::iterator_adaptor< + MidWiseTransformIterator, const T*> { + public: + typedef thrust::iterator_adaptor< + MidWiseTransformIterator, const T*> + super_t; + HOSTDEVICE MidWiseTransformIterator(const T* x, int n, int post) + : super_t(x), begin_(x), n_(n), post_(post){}; + friend class thrust::iterator_core_access; + + private: + unsigned int post_; + unsigned int n_; + const T* begin_; + HOSTDEVICE typename super_t::reference dereference() const { + return *(begin_ + (((this->base() - begin_) / post_) % n_)); + } +}; +#endif + +template +class TransformFunctor { + public: + TransformFunctor(const framework::Tensor* x, const framework::Tensor* y, + framework::Tensor* z, const DeviceContext& ctx, Functor func) + : x_(x->data()), + y_(y->data()), + z_(z->mutable_data(ctx.GetPlace())), + nx_(x->numel()), + ctx_(ctx), + func_(func) {} + + inline void Run() const { + platform::Transform trans; + trans(ctx_, x_, x_ + nx_, y_, z_, func_); + } + + inline void RunRowWise(int n, int pre) const { + 
platform::Transform trans; + trans(ctx_, x_, x_ + nx_, RowwiseTransformIterator(y_, n), + z_, func_); + } + + inline void RunMidWise(int n, int pre, int post) const { + platform::Transform trans; + trans(ctx_, x_, x_ + nx_, + MidWiseTransformIterator(y_, n, post), z_, func_); + } + + private: + const T* x_; + const T* y_; + OutType* z_; + int64_t nx_; + const DeviceContext& ctx_; + Functor func_; +}; + +#define EIGEN_FUNCTOR(name, eigen_op) \ + struct Eigen##name##Functor { \ + template \ + inline void Run(const framework::Tensor* x, const framework::Tensor* y, \ + framework::Tensor* z, \ + const framework::ExecutionContext& ctx) { \ + auto x_e = framework::EigenVector::Flatten(*x); \ + auto y_e = framework::EigenVector::Flatten(*y); \ + auto z_e = framework::EigenVector::Flatten(*z); \ + z_e.device( \ + *ctx.template device_context().eigen_device()) = \ + eigen_op(x_e, y_e); \ + } \ + template \ + inline void RunBroadCast(const framework::Tensor* x, \ + const framework::Tensor* y, framework::Tensor* z, \ + const framework::ExecutionContext& ctx, int pre, \ + int n) { \ + auto x_e = framework::EigenVector::Flatten(*x); \ + auto y_e = framework::EigenVector::Flatten(*y); \ + auto z_e = framework::EigenVector::Flatten(*z); \ + auto y_bcast = y_e.reshape(Eigen::DSizes(1, n)) \ + .broadcast(Eigen::DSizes(pre, 1)) \ + .reshape(Eigen::DSizes(x_e.size())); \ + z_e.device( \ + *ctx.template device_context().eigen_device()) = \ + eigen_op(x_e, y_bcast); \ + } \ + template \ + inline void RunBroadCast2(const framework::Tensor* x, \ + const framework::Tensor* y, \ + framework::Tensor* z, \ + const framework::ExecutionContext& ctx, int pre, \ + int n, int post) { \ + auto x_e = framework::EigenVector::Flatten(*x); \ + auto y_e = framework::EigenVector::Flatten(*y); \ + auto z_e = framework::EigenVector::Flatten(*z); \ + auto y_bcast = y_e.reshape(Eigen::DSizes(1, n, 1)) \ + .broadcast(Eigen::DSizes(pre, 1, post)) \ + .reshape(Eigen::DSizes(x_e.size())); \ + z_e.device( \ + *ctx.template device_context().eigen_device()) = \ + eigen_op(x_e, y_bcast); \ + } \ + } + +template +void ElementwiseCompute(const framework::ExecutionContext& ctx) { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + + auto x_dims = x->dims(); + auto y_dims = y->dims(); + PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), + "Rank of first input must >= rank of second input."); + + if (x_dims == y_dims) { + functor f; + f.template Run(x, y, z, ctx); + return; + } + + int axis = ctx.Attr("axis"); + axis = (axis == -1 ? 
x_dims.size() - y_dims.size() : axis); + PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(), + "Axis should be in range [0, x_dims)"); + + int pre, n, post; + get_mid_dims(x_dims, y_dims, axis, pre, n, post); + if (post == 1) { + functor f; + f.template RunBroadCast(x, y, z, ctx, pre, n); + return; + } else { + functor f; + f.template RunBroadCast2(x, y, z, ctx, pre, n, post); + return; + } +} + +#define EIGEN_ADD(x, y) ((x) + (y)) +EIGEN_FUNCTOR(Add, EIGEN_ADD); + +#define EIGEN_SUB(x, y) ((x) - (y)) +EIGEN_FUNCTOR(Sub, EIGEN_SUB); + +#define EIGEN_MUL(x, y) ((x) * (y)) +EIGEN_FUNCTOR(Mul, EIGEN_MUL); + +#define EIGEN_DIV(x, y) ((x) / (y)) +EIGEN_FUNCTOR(Div, EIGEN_DIV); + +template +void ElementwiseGradCompute(const framework::ExecutionContext& ctx, + + const framework::Tensor* x, + const framework::Tensor* y, + const framework::Tensor* out, + const framework::Tensor* dout, int axis, + framework::Tensor* dx, framework::Tensor* dy) { + auto& place = *ctx.template device_context().eigen_device(); + + auto x_dims = x->dims(); + auto y_dims = y->dims(); + + if (dx) { + dx->mutable_data(ctx.GetPlace()); + } + if (dy) { + dy->mutable_data(ctx.GetPlace()); + } + + if (x_dims == y_dims) { + functor f; + f(place, x, y, out, dx, dy, dout); + return; + } + + if (y_dims.size() == 1 && y_dims[0] == 1) { + // y is a scalar + auto extended_dims = framework::vectorize(x_dims); + extended_dims.push_back(1); + x_dims = framework::make_ddim(extended_dims); + } + + axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); + + int pre, n, post; + get_mid_dims(x_dims, y_dims, axis, pre, n, post); + + if (post == 1) { + broadcastfunctor f; + f(place, x, y, out, dx, dy, dout, pre, n); + return; + } else { + broadcast2functor f; + f(place, x, y, out, dx, dy, dout, pre, n, post); + return; + } +} + +template +void ElementwiseComputeEx(const framework::ExecutionContext& ctx, + const framework::Tensor* x, + const framework::Tensor* y, int axis, Functor func, + framework::Tensor* z) { + TransformFunctor functor( + x, y, z, ctx.template device_context(), func); + + auto x_dims = x->dims(); + auto y_dims = y->dims(); + PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), + "Rank of first input must >= rank of second input."); + + if (x_dims == y_dims) { + functor.Run(); + return; + } + + if (y_dims.size() == 1 && y_dims[0] == 1) { + // y is a scalar + auto extended_dims = framework::vectorize(x_dims); + extended_dims.push_back(1); + x_dims = framework::make_ddim(extended_dims); + } + + axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); + PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(), + "Axis should be in range [0, x_dims)"); + + int pre, n, post; + get_mid_dims(x_dims, y_dims, axis, pre, n, post); + if (post == 1) { + functor.RunRowWise(n, pre); + return; + } else { + functor.RunMidWise(n, pre, post); + return; + } +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/elementwise_pow_op.cc b/paddle/fluid/operators/elementwise_pow_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..911b5dbd2501e6c5ef6177a23592fadeb3383002 --- /dev/null +++ b/paddle/fluid/operators/elementwise_pow_op.cc @@ -0,0 +1,37 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
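All of the Compute/GradCompute paths above reduce broadcasting to the triple (pre, n, post) produced by get_mid_dims: after resolving axis (defaulting to x_rank - y_rank), pre is the product of X's dimensions before axis, n is the product of Y's dimensions, and post is the product of X's trailing dimensions, so X can be viewed as (pre, n, post) with Y broadcast along the middle. The following standalone sketch reproduces that arithmetic for the shapes in the header comment; the helper names are hypothetical and not part of the file.

#include <cassert>
#include <vector>

// Mirror of get_mid_dims for plain shape vectors: view X as (pre, n, post)
// with Y broadcast along the middle n elements.
void GetMidDims(const std::vector<int>& x_dims, const std::vector<int>& y_dims,
                int axis, int* pre, int* n, int* post) {
  *pre = 1;
  *n = 1;
  *post = 1;
  for (int i = 0; i < axis; ++i) *pre *= x_dims[i];
  for (size_t i = 0; i < y_dims.size(); ++i) *n *= y_dims[i];
  for (size_t i = axis + y_dims.size(); i < x_dims.size(); ++i)
    *post *= x_dims[i];
}

int main() {
  int pre, n, post;
  // shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), axis = 1
  GetMidDims({2, 3, 4, 5}, {3, 4}, 1, &pre, &n, &post);
  assert(pre == 2 && n == 12 && post == 5);
  // shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5), axis defaults to 4 - 2 = 2
  GetMidDims({2, 3, 4, 5}, {4, 5}, 2, &pre, &n, &post);
  assert(pre == 6 && n == 20 && post == 1);
  return 0;
}

When post == 1, ElementwiseComputeEx takes the RunRowWise path (the cyclic RowwiseTransformIterator walks Y's n elements over and over); otherwise RunMidWise repeats each Y element post times before advancing, matching the (pre, n, post) view above.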
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/elementwise_pow_op.h" +#include "paddle/fluid/operators/elementwise_op.h" + +namespace paddle { +namespace operators { +class ElementwisePowOpMaker : public ElementwiseOpMaker { + public: + ElementwisePowOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : ElementwiseOpMaker(proto, op_checker) { + SetComment("Pow", "Out = X ^ Y"); + AddComment(comment_); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(elementwise_pow, ops::ElementwiseOp, + ops::ElementwisePowOpMaker); +REGISTER_OP_CPU_KERNEL( + elementwise_pow, + ops::ElementwisePowKernel, + ops::ElementwisePowKernel); diff --git a/paddle/fluid/operators/elementwise_pow_op.cu b/paddle/fluid/operators/elementwise_pow_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..2996600738fe11c3fef67c3f4c5660ff05e37957 --- /dev/null +++ b/paddle/fluid/operators/elementwise_pow_op.cu @@ -0,0 +1,20 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/elementwise_pow_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + elementwise_pow, + ops::ElementwisePowKernel, + ops::ElementwisePowKernel); diff --git a/paddle/fluid/operators/elementwise_pow_op.h b/paddle/fluid/operators/elementwise_pow_op.h new file mode 100644 index 0000000000000000000000000000000000000000..b793c1eae0ec3a796897c7d81ac061f80ccffdb6 --- /dev/null +++ b/paddle/fluid/operators/elementwise_pow_op.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include "paddle/fluid/operators/elementwise_op_function.h" + +namespace paddle { +namespace operators { + +template +struct PowFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return std::pow(a, b); } +}; + +template +class ElementwisePowKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + PowFunctor(), z); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise_sub_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..46ce01c7cf5bf4930d05535e22f1d54073838071 --- /dev/null +++ b/paddle/fluid/operators/elementwise_sub_op.cc @@ -0,0 +1,45 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/elementwise_sub_op.h" +#include "paddle/fluid/operators/elementwise_op.h" + +namespace paddle { +namespace operators { +class ElementwiseSubOpMaker : public ElementwiseOpMaker { + public: + ElementwiseSubOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : ElementwiseOpMaker(proto, op_checker) { + SetComment("Sub", "Out = X - Y"); + AddComment(comment_); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(elementwise_sub, ops::ElementwiseOp, ops::ElementwiseSubOpMaker, + elementwise_sub_grad, ops::ElementwiseOpGrad); +REGISTER_OP_CPU_KERNEL( + elementwise_sub, + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel); +REGISTER_OP_CPU_KERNEL( + elementwise_sub_grad, + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel); diff --git a/paddle/fluid/operators/elementwise_sub_op.cu b/paddle/fluid/operators/elementwise_sub_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..eb09d6c5edcb6e8460de71d76077fd103d799847 --- /dev/null +++ b/paddle/fluid/operators/elementwise_sub_op.cu @@ -0,0 +1,32 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/elementwise_sub_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + elementwise_sub, + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel); +REGISTER_OP_CUDA_KERNEL( + elementwise_sub_grad, + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel); diff --git a/paddle/fluid/operators/elementwise_sub_op.h b/paddle/fluid/operators/elementwise_sub_op.h new file mode 100644 index 0000000000000000000000000000000000000000..af2d497b9ae8f892aa272211ee2158d063d13909 --- /dev/null +++ b/paddle/fluid/operators/elementwise_sub_op.h @@ -0,0 +1,121 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/operators/elementwise_op_function.h" + +namespace paddle { +namespace operators { + +template +struct SubFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a - b; } +}; + +template +class ElementwiseSubKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + SubFunctor(), z); + } +}; + +template +struct ElementwiseSubGradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) { + auto dz_e = framework::EigenVector::Flatten(*dz); + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e; + } + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = (-1.0) * dz_e; + } + } +}; + +template +struct ElementwiseSubBroadCastGradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) { + auto dz_e = framework::EigenVector::Flatten(*dz); + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = (-1.0) * + dz_e.reshape(Eigen::DSizes(pre, n)) + .sum(Eigen::array{{0}}); + } + } +}; + +template +struct ElementwiseSubBroadCast2GradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n, + Post post) { + auto dz_e = framework::EigenVector::Flatten(*dz); + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = (-1.0) * + dz_e.reshape(Eigen::DSizes(pre, n, post)) + .sum(Eigen::array{{0, 2}}); + } + } +}; + +template +class ElementwiseSubGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + 
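+ // Since Out = X - Y, dX is simply dOut and dY is -dOut; when Y was
+ // broadcast against X, the BroadCast grad functors defined above also sum
+ // -dOut over the broadcast dimensions so that dY keeps Y's original shape.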
auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); + ElementwiseGradCompute, + ElementwiseSubBroadCastGradFunctor, + ElementwiseSubBroadCast2GradFunctor>( + ctx, x, y, out, dout, axis, dx, dy); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..ccb9a94856fe868c8069510a7c557dfb8c22c369 --- /dev/null +++ b/paddle/fluid/operators/expand_op.cc @@ -0,0 +1,137 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/expand_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class ExpandOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null."); + + std::vector expand_times = + ctx->Attrs().Get>("expand_times"); + auto x_dims = ctx->GetInputDim("X"); + + PADDLE_ENFORCE_EQ(static_cast(x_dims.size()), expand_times.size(), + "The number of Attr(expand_times)'s value must be equal " + "to the rank of Input(X)."); + PADDLE_ENFORCE_LE(x_dims.size(), 6, + "The rank of Input(X) must not be greater than 6."); + + std::vector out_shape(x_dims.size()); + for (size_t i = 0; i < expand_times.size(); ++i) { + PADDLE_ENFORCE_GE(expand_times[i], 1, + "Each value of Attr(expand_times) should not be " + "less than 1."); + out_shape[i] = x_dims[i] * expand_times[i]; + } + + ctx->SetOutputDim("Out", framework::make_ddim(out_shape)); + if (out_shape[0] == x_dims[0]) { + ctx->ShareLoD("X", "Out"); + } + } +}; + +class ExpandOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ExpandOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(Tensor, default Tensor). A tensor with rank in [1, 6]." + "X is the input to be expanded."); + AddOutput("Out", + "(Tensor, default Tensor). A tensor with rank in [1, 6]." + "The rank of Output(Out) have the same with Input(X). " + "After expanding, size of each dimension of Output(Out) is equal " + "to size of the corresponding dimension of Input(X) multiplying " + "the corresponding value given by Attr(expand_times)."); + AddAttr>("expand_times", + "Expand times number for each dimension."); + AddComment(R"DOC( +Expand operator tiles the input by given times number. You should set times +number for each dimension by providing attribute 'expand_times'. The rank of X +should be in [1, 6]. 
Please note that size of 'expand_times' must be the same +with X's rank. Following is a using case: + +Input(X) is a 3-D tensor with shape [2, 3, 1]: + + [ + [[1], [2], [3]], + [[4], [5], [6]] + ] + +Attr(expand_times): [1, 2, 2] + +Output(Out) is a 3-D tensor with shape [2, 6, 2]: + + [ + [[1, 1], [2, 2], [3, 3], [1, 1], [2, 2], [3, 3]], + [[4, 4], [5, 5], [6, 6], [4, 4], [5, 5], [6, 6]] + ] + +)DOC"); + } +}; + +class ExpandGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + std::vector expand_times = + ctx->Attrs().Get>("expand_times"); + auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); + + for (size_t i = 0; i < expand_times.size(); ++i) { + PADDLE_ENFORCE_EQ(x_dims[i] * expand_times[i], out_dims[i], + "Each dimension size of Input(Out@GRAD) should be " + "equal to multiplication of crroresponding dimension " + "size of Input(X) and Attr(expand_times) value."); + } + + auto x_grad_name = framework::GradVarName("X"); + + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(expand, ops::ExpandOp, ops::ExpandOpMaker, expand_grad, + ops::ExpandGradOp); +REGISTER_OP_CPU_KERNEL( + expand, ops::ExpandKernel); +REGISTER_OP_CPU_KERNEL( + expand_grad, + ops::ExpandGradKernel); diff --git a/paddle/fluid/operators/expand_op.cu b/paddle/fluid/operators/expand_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..8a9f39708beec3e0d32d65245c63f8ccf9df8604 --- /dev/null +++ b/paddle/fluid/operators/expand_op.cu @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU + +#include "paddle/fluid/operators/expand_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + expand, ops::ExpandKernel); +REGISTER_OP_CUDA_KERNEL( + expand_grad, + ops::ExpandGradKernel); diff --git a/paddle/fluid/operators/expand_op.h b/paddle/fluid/operators/expand_op.h new file mode 100644 index 0000000000000000000000000000000000000000..8df1cd34d7dc5093b9bdcd3d015be4f9958d089d --- /dev/null +++ b/paddle/fluid/operators/expand_op.h @@ -0,0 +1,174 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +#define MAX_RANK_SUPPORTED 6 + +#define EXPAND_TEMPLATE(z, n, data) \ + case n + 1: { \ + Expand(context); \ + break; \ + } +#define REP_EXPAND_TEMPLATE(n) BOOST_PP_REPEAT(n, EXPAND_TEMPLATE, ~) +#define COND(n) \ + BOOST_PP_GREATER_EQUAL(BOOST_PP_DIV(n, MAX_RANK_SUPPORTED), \ + BOOST_PP_MOD(n, MAX_RANK_SUPPORTED)) +#define EXPAND_GRAD_CASE(n) \ + case n: { \ + ExpandBackward(context, reshape_dims_vec, reduce_dims_vec); \ + break; \ + } +#define EXPAND_GRAD_TEMPLATE(z, n, data) \ + BOOST_PP_IF(COND(n), EXPAND_GRAD_CASE(n), ) +#define REP_EXPAND_GRAD_TEMPLATE(n) BOOST_PP_REPEAT(n, EXPAND_GRAD_TEMPLATE, ~) + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; +template +using EigenTensor = framework::EigenTensor; + +template +class ExpandKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto rank = context.Input("X")->dims().size(); + switch (rank) { + REP_EXPAND_TEMPLATE(MAX_RANK_SUPPORTED) + default: + PADDLE_ENFORCE(false, + "Only support tensor with rank being between 1 and 6."); + } + } + + protected: + template + void Expand(const framework::ExecutionContext& context) const { + auto* in0 = context.Input("X"); + auto& expand_times = context.Attr>("expand_times"); + auto* out0 = context.Output("Out"); + Eigen::DSizes bcast_dims; + auto x_dims = in0->dims(); + for (size_t i = 0; i < expand_times.size(); ++i) { + bcast_dims[i] = expand_times[i]; + } + auto x = EigenTensor::From(*in0); + out0->mutable_data(context.GetPlace()); + auto y = EigenTensor::From(*out0); + auto& place = + *context.template device_context().eigen_device(); + y.device(place) = x.broadcast(bcast_dims); + } +}; + +template +class ExpandGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in0 = context.Input("X"); + auto& expand_times = context.Attr>("expand_times"); + auto x_dims = in0->dims(); + // 1. reshape_dims_vec is the broadcast parameter. For each dimension i, + // if expand_times[i] > 1 and x_dims[i] > 1, i will be splitted to two + // dimensions [expand_times[i], x_dims[i]]. + // 2. reduce_dims_vec is the dimension parameter to compute gradients. For + // each dimension expanded, the gradients should be summed to original + // size. 
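+ // For example, if x_dims = [12, 14] and expand_times = [2, 3], then
+ // reshape_dims_vec = [2, 12, 3, 14] and reduce_dims_vec = [0, 2]:
+ // Out's gradient is reshaped to [2, 12, 3, 14], summed over axes 0 and 2,
+ // and reshaped back to [12, 14] to form X's gradient.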
+ std::vector reshape_dims_vec; + std::vector reduce_dims_vec; + for (size_t i = 0; i < expand_times.size(); ++i) { + if (expand_times[i] == 1) { + reshape_dims_vec.push_back(x_dims[i]); + } else { + if (x_dims[i] == 1) { + reduce_dims_vec.push_back(reshape_dims_vec.size()); + reshape_dims_vec.push_back(expand_times[i]); + } else { + reduce_dims_vec.push_back(reshape_dims_vec.size()); + reshape_dims_vec.push_back(expand_times[i]); + reshape_dims_vec.push_back(x_dims[i]); + } + } + } + + int dims = reshape_dims_vec.size() * MAX_RANK_SUPPORTED + + reduce_dims_vec.size() - MAX_RANK_SUPPORTED - 1; + // no need reduce, just copy + if (reduce_dims_vec.size() == 0) { + auto* in0 = context.Input(framework::GradVarName("Out")); + auto* out0 = context.Output(framework::GradVarName("X")); + out0->mutable_data(context.GetPlace()); + framework::Copy(*in0, context.GetPlace(), context.device_context(), out0); + } else { + switch (dims) { + REP_EXPAND_GRAD_TEMPLATE(72) + default: + PADDLE_ENFORCE( + false, "Only support tensor with rank being between 1 and 6."); + } + } + } + + protected: + template + void ExpandBackward(const framework::ExecutionContext& context, + const std::vector& reshape_dims_vec, + const std::vector& reduce_dims_vec) const { + size_t reshape_size = Dims / MAX_RANK_SUPPORTED + 1; + size_t reduce_size = Dims % MAX_RANK_SUPPORTED + 1; + PADDLE_ENFORCE_EQ(reshape_size, reshape_dims_vec.size(), + "Inconsistent size between template Dims and " + "reshape dimensions."); + PADDLE_ENFORCE_EQ(reduce_size, reduce_dims_vec.size(), + "Inconsistent size between template Dims and " + "reduce dimensions."); + auto* in0 = context.Input(framework::GradVarName("Out")); + auto* out0 = context.Output(framework::GradVarName("X")); + auto x = EigenVector::Flatten(*(context.Input("X"))); + out0->mutable_data(context.GetPlace()); + auto x_grad = EigenVector::Flatten(*out0); + Eigen::DSizes reshape_dims; + for (size_t i = 0; i < reshape_size; ++i) { + reshape_dims[i] = reshape_dims_vec[i]; + } + Eigen::DSizes reduce_dims; + for (size_t i = 0; i < reduce_size; ++i) { + reduce_dims[i] = reduce_dims_vec[i]; + } + auto out_grad = EigenVector::Flatten(*in0); + x_grad.device( + *context.template device_context().eigen_device()) = + out_grad.reshape(reshape_dims).sum(reduce_dims).reshape(x.dimensions()); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/feed_op.cc b/paddle/fluid/operators/feed_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0b3f5f0d1d09a932e15936285f5cb226daa86e95 --- /dev/null +++ b/paddle/fluid/operators/feed_op.cc @@ -0,0 +1,85 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { +class FeedOp : public framework::OperatorBase { + public: + FeedOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto feed_var_name = Input("X"); + auto *feed_var = scope.FindVar(feed_var_name); + + PADDLE_ENFORCE(feed_var != nullptr, + "Cannot find feed_var in scope, feed_var_name is %s", + feed_var_name); + + auto out_name = this->Output("Out"); + auto *out_var = scope.FindVar(out_name); + PADDLE_ENFORCE(out_var != nullptr, + "Cannot find out_var in scope, out_var_name is %s", + out_name); + + auto col = Attr("col"); + + VLOG(3) << "Feed Var " << feed_var_name << "'s " << col << " column to var " + << out_name; + + auto &feed_list = feed_var->Get(); + auto &feed_item = feed_list.at(static_cast(col)); + auto *out_item = out_var->GetMutable(); + + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + if (platform::is_same_place(feed_item.place(), place)) { + out_item->ShareDataWith(feed_item); + } else { + framework::Copy(feed_item, place, dev_ctx, out_item); + } + out_item->set_lod(feed_item.lod()); + } +}; + +class FeedOpInfoMaker : public framework::OpProtoAndCheckerMaker { + public: + FeedOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input of feed op"); + AddOutput("Out", "The output of feed op"); + AddAttr("col", "(int) The column of feed"); + AddComment(R"DOC( +Feed Operator. + +It should not be configured by users directly. + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(feed, paddle::operators::FeedOp, + paddle::framework::EmptyGradOpMaker, + paddle::operators::FeedOpInfoMaker); diff --git a/paddle/fluid/operators/fetch_op.cc b/paddle/fluid/operators/fetch_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..54e5892016cdb01f50189147a7453b868c5a48c0 --- /dev/null +++ b/paddle/fluid/operators/fetch_op.cc @@ -0,0 +1,86 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +class FetchOp : public framework::OperatorBase { + public: + FetchOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto fetch_var_name = Input("X"); + auto *fetch_var = scope.FindVar(fetch_var_name); + PADDLE_ENFORCE(fetch_var != nullptr, + "Cannot find fetch variable in scope, fetch_var_name is %s", + fetch_var_name); + + auto out_name = this->Output("Out"); + auto *out_var = scope.FindVar(out_name); + PADDLE_ENFORCE(out_var != nullptr, + "Cannot find out_var in scope, out_var_name is %s", + out_name); + + auto col = static_cast(Attr("col")); + + auto *fetch_list = out_var->GetMutable(); + auto &src_item = fetch_var->Get(); + + if (col >= fetch_list->size()) { + fetch_list->resize(col + 1); + } + auto &dst_item = fetch_list->at(col); + + // FIXME(yuyang18): Should we assume the fetch operator always generate + // CPU outputs? + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(src_item.place()); + + Copy(src_item, platform::CPUPlace(), dev_ctx, &dst_item); + dev_ctx.Wait(); + dst_item.set_lod(src_item.lod()); + + VLOG(3) << "Fetch variable " << fetch_var_name << " to " << out_name; + } +}; + +class FetchOpInfoMaker : public framework::OpProtoAndCheckerMaker { + public: + FetchOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input of fetch op"); + AddOutput("Out", "The output of fetch op"); + AddAttr("col", "(int) The column of fetch"); + AddComment(R"DOC( +Fetch Operator. + +It should not be configured by users directly. + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(fetch, paddle::operators::FetchOp, + paddle::framework::EmptyGradOpMaker, + paddle::operators::FetchOpInfoMaker); diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e6992ba371c1dd61f7f6fa293be586818350fb3f --- /dev/null +++ b/paddle/fluid/operators/fill_constant_batch_size_like_op.cc @@ -0,0 +1,109 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/fill_constant_batch_size_like_op.h" + +namespace paddle { +namespace operators { + +class FillConstantBatchSizeLikeOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE( + ctx->HasInput("Input"), + "Input(Input) of FillConstantBatchSizeLikeOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("Out"), + "Output(Out) of FillConstantBatchSizeLikeOp should not be null."); + + auto &shape = ctx->Attrs().Get>("shape"); + PADDLE_ENFORCE_GT(shape.size(), 0); + std::vector shape_int64(shape.size(), 0); + std::transform(shape.begin(), shape.end(), shape_int64.begin(), + [](int a) { return static_cast(a); }); + auto output_dim = framework::make_ddim(shape_int64); + + int input_dim_idx = ctx->Attrs().Get("input_dim_idx"); + PADDLE_ENFORCE_GE(input_dim_idx, 0); + PADDLE_ENFORCE_GT(ctx->GetInputDim("Input").size(), input_dim_idx); + + int output_dim_idx = ctx->Attrs().Get("output_dim_idx"); + PADDLE_ENFORCE_GE(output_dim_idx, 0); + PADDLE_ENFORCE_GT(static_cast(shape.size()), output_dim_idx); + + output_dim[output_dim_idx] = ctx->GetInputDim("Input")[input_dim_idx]; + ctx->SetOutputDim("Out", output_dim); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + static_cast(ctx.Attr("dtype")), + ctx.device_context()); + } +}; + +class FillConstantBatchSizeLikeOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + FillConstantBatchSizeLikeOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddAttr("dtype", + "(int, default 5 (FP32)) " + "Output data type") + .SetDefault(framework::proto::DataType::FP32); + AddInput("Input", + "(Tensor) Tensor " + "whose dim_idx th dimension is used to specify the batch_size"); + AddOutput("Out", + "(Tensor) Tensor of specified shape will be filled " + "with the specified value"); + AddAttr>("shape", "(vector) The shape of the output"); + AddAttr("input_dim_idx", + "(int, default 0) The index of input's batch size dimension") + .SetDefault(0); + AddAttr("output_dim_idx", + "(int, default 0) The index of output's batch size dimension") + .SetDefault(0); + AddAttr("value", "(float, default 0) The value to be filled") + .SetDefault(0.0f); + AddComment(R"DOC( +FillConstantBatchSizeLike Operator. + +Fill up a variable with specified constant value. + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(fill_constant_batch_size_like, + ops::FillConstantBatchSizeLikeOp, + paddle::framework::EmptyGradOpMaker, + ops::FillConstantBatchSizeLikeOpMaker); +REGISTER_OP_CPU_KERNEL( + fill_constant_batch_size_like, + ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel); diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op.cu.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..b4f4d2a50305e2582f23ceed931d655f9690e110 --- /dev/null +++ b/paddle/fluid/operators/fill_constant_batch_size_like_op.cu.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/fill_constant_batch_size_like_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + fill_constant_batch_size_like, + ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel); diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op.h b/paddle/fluid/operators/fill_constant_batch_size_like_op.h new file mode 100644 index 0000000000000000000000000000000000000000..da4a20d99a13533019d57fca42b1b49780200b79 --- /dev/null +++ b/paddle/fluid/operators/fill_constant_batch_size_like_op.h @@ -0,0 +1,37 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +class FillConstantBatchSizeLikeOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + auto value = ctx.Attr("value"); + + math::SetConstant setter; + setter(ctx.template device_context(), out, + static_cast(value)); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d4bf6406e5716a6b65a234d1cd642b64dcc5726f --- /dev/null +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -0,0 +1,91 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +class FillConstantInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of FillConstantOp should not be null."); + auto &shape = ctx->Attrs().Get>("shape"); + ctx->SetOutputDim("Out", framework::make_ddim(shape)); + } +}; + +class FillConstantOp : public framework::OperatorBase { + public: + using framework::OperatorBase::OperatorBase; + void Run(const framework::Scope &scope, + const platform::Place &dev_place) const override { + auto data_type = + static_cast(Attr("dtype")); + auto value = Attr("value"); + auto force_cpu = Attr("force_cpu"); + auto &out = + *scope.FindVar(Output("Out"))->GetMutable(); + out.Resize(framework::make_ddim(Attr>("shape"))); + if (force_cpu) { + auto cpu = platform::CPUPlace(); + out.mutable_data(cpu, framework::ToTypeIndex(data_type)); + } else { + out.mutable_data(dev_place, framework::ToTypeIndex(data_type)); + } + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); + math::set_constant(dev_ctx, &out, value); + } +}; + +class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker { + public: + FillConstantOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddAttr("dtype", + "(int, default 5 (FP32)) " + "Output data type") + .SetDefault(framework::proto::DataType::FP32); + AddAttr>("shape", "(vector) The shape of the output"); + AddAttr("value", "(float, default 0) The value to be filled") + .SetDefault(0.0f); + AddAttr("force_cpu", + "(bool, default false) Force fill output variable to cpu " + "memory. Otherwise, fill output variable to the running " + "device") + .SetDefault(false); + AddOutput("Out", + "(Tensor) Tensor of specified shape will be filled " + "with the specified value"); + AddComment(R"DOC( +FillConstantBatchSizeLike Operator. + +Fill up a variable with specified constant value. + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(fill_constant, ops::FillConstantOp, + ops::FillConstantInferShape, ops::FillConstantOpMaker, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/fill_op.cc b/paddle/fluid/operators/fill_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..8e318f37cf0bc945597b5aa7b384e53038c97786 --- /dev/null +++ b/paddle/fluid/operators/fill_op.cc @@ -0,0 +1,114 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detail/safe_ref.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +struct FillOpVisitor { + FillOpVisitor(framework::LoDTensor *tensor, const std::vector &value) + : tensor_(tensor), value_(value) {} + + template + void operator()() const { + platform::CPUPlace cpu; + auto *data = tensor_->mutable_data(cpu); + std::transform(value_.data(), value_.data() + tensor_->numel(), data, + [](float dat) { return static_cast(dat); }); + } + + framework::LoDTensor *tensor_; + const std::vector &value_; +}; + +class FillOp : public framework::OperatorBase { + public: + FillOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto &out = + detail::Ref(detail::Ref(scope.FindVar(Output("Out")), + "Cannot find variable %s", Output("Out")) + .GetMutable()); + out.Resize(framework::make_ddim(Attr>("shape"))); + auto dtype = static_cast(Attr("dtype")); + platform::CPUPlace cpu; + auto force_cpu = Attr("force_cpu"); + out.mutable_data(force_cpu ? cpu : place, framework::ToTypeIndex(dtype)); + + framework::LoDTensor tensor; + + if (force_cpu || platform::is_cpu_place(place)) { + tensor.ShareDataWith(out); + } else { + // Always make tensor in CPU memory. + tensor.Resize(out.dims()); + tensor.mutable_data(cpu, framework::ToTypeIndex(dtype)); + } + + framework::VisitDataType( + dtype, FillOpVisitor(&tensor, Attr>("value"))); + + if (!force_cpu && platform::is_gpu_place(place)) { + // Copy tensor to out + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + framework::Copy(tensor, place, dev_ctx, &out); + } + } +}; + +class FillOpMaker : public framework::OpProtoAndCheckerMaker { + public: + FillOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddComment(R"DOC(Fill operator + +Fill an tensor with `value` and `shape`. The type of the tensor is specify by +`dtype`. +)DOC"); + AddOutput("Out", "(LoDTensor) The output tensor."); + AddAttr>( + "value", "The float values of tensor, which are flatten in row major"); + AddAttr>("shape", "The shape of output tensor"); + AddAttr("dtype", "The data type of output tensor, Default is float") + .SetDefault(framework::proto::DataType::FP32); + AddAttr("force_cpu", + "Whether the output tensor must be at CPU memory or not. " + "Default is false.") + .SetDefault(false); + } +}; + +class FillOpInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + context->SetOutputDim( + "Out", + framework::make_ddim(context->Attrs().Get>("shape"))); + } +}; + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; +REGISTER_OPERATOR(fill, ops::FillOp, ops::FillOpInferShape, ops::FillOpMaker); diff --git a/paddle/fluid/operators/fill_zeros_like_op.cc b/paddle/fluid/operators/fill_zeros_like_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..958bfb1557d9fa39534caef594818aa97bbe03a6 --- /dev/null +++ b/paddle/fluid/operators/fill_zeros_like_op.cc @@ -0,0 +1,61 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/fill_zeros_like_op.h" + +namespace paddle { +namespace operators { + +class FillZerosLikeOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of FillZerosLikeOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of FillZerosLikeOp should not be null."); + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class FillZerosLikeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + FillZerosLikeOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input of fill-zeros-like op."); + AddOutput("Out", "The variable will be filled up with zeros."); + AddComment(R"DOC( +FillZerosLike Operator. + +Fill up a variable with zeros. +The output will have the same size as the input. + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, ops::FillZerosLikeOp, + ops::FillZerosLikeOpMaker); +REGISTER_OP_CPU_KERNEL( + fill_zeros_like, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel); diff --git a/paddle/fluid/operators/fill_zeros_like_op.cu.cc b/paddle/fluid/operators/fill_zeros_like_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..07078573d8aaa1d72f876f4be68ff70d8a56d8a1 --- /dev/null +++ b/paddle/fluid/operators/fill_zeros_like_op.cu.cc @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/fill_zeros_like_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + fill_zeros_like, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel); diff --git a/paddle/fluid/operators/fill_zeros_like_op.h b/paddle/fluid/operators/fill_zeros_like_op.h new file mode 100644 index 0000000000000000000000000000000000000000..141c3809e9aa3e2984bf802418f8ddf7d92fa446 --- /dev/null +++ b/paddle/fluid/operators/fill_zeros_like_op.h @@ -0,0 +1,36 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +class FillZerosLikeKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* out = context.Output("Out"); + out->mutable_data(context.GetPlace()); + + math::SetConstant setter; + setter(context.template device_context(), out, + static_cast(0)); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/ftrl_op.cc b/paddle/fluid/operators/ftrl_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e72a173751e9b163b6083df474c2b46c76ed459d --- /dev/null +++ b/paddle/fluid/operators/ftrl_op.cc @@ -0,0 +1,139 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/ftrl_op.h" + +namespace paddle { +namespace operators { + +class FTRLOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Param"), + "Input(Param) of FTRL should not be null."); + PADDLE_ENFORCE(ctx->HasInput("SquaredAccumulator"), + "Input(SquaredAccumulator) of FTRL should not be null."); + PADDLE_ENFORCE(ctx->HasInput("LinearAccumulator"), + "Input(LinearAccumulator) of FTRL should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(Grad) of FTRL should not be null."); + PADDLE_ENFORCE(ctx->HasInput("LearningRate"), + "Input(LearningRate) of FTRL should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + "Output(ParamOut) of FTRL should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("SquaredAccumOut"), + "Output(SquaredAccumOut) of FTRL should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("LinearAccumOut"), + "Output(LinearAccumOut) of FTRL should not be null."); + + auto param_dim = ctx->GetInputDim("Param"); + PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("Grad"), + "Two input of FTRL Op's dimension must be same."); + + auto lr_dim = ctx->GetInputDim("LearningRate"); + PADDLE_ENFORCE_EQ(framework::product(lr_dim), 1, + "Learning Rate should be a scalar."); + + ctx->SetOutputDim("ParamOut", param_dim); + ctx->SetOutputDim("SquaredAccumOut", param_dim); + ctx->SetOutputDim("LinearAccumOut", param_dim); + } +}; + +class FTRLOpMaker : public framework::OpProtoAndCheckerMaker { + public: + FTRLOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Param", + "(Tensor, default Tensor) " + "Input parameter value that has to be updated."); + AddInput("SquaredAccumulator", + "(Tensor, default Tensor) " + "Accumulator that accumulates squared gradients."); + AddInput("LinearAccumulator", + "(Tensor, default Tensor) " + "Accumulator that accumulates linear gradients."); + AddInput("Grad", + "(Tensor, default Tensor) " + "Input gradient of the parameter."); + AddInput("LearningRate", + "(Tensor, default Tensor) " + "The learning rate should be a tensor of size 1."); + + AddOutput("ParamOut", "(Tensor) Output updated parameter value."); + AddOutput("SquaredAccumOut", + "(Tensor) Output accumulated squared" + " gradients."); + AddOutput("LinearAccumOut", + "(Tensor) Output accumulated linear" + " gradients."); + + AddAttr("l1", + "(float, default 0.0) " + "L1 regularization strength.") + .SetDefault(0.0f); + AddAttr("l2", + "(float, default 0.0) " + "L2 regularization strength.") + .SetDefault(0.0f); + AddAttr("lr_power", + "(float, default -0.5f) " + "Learning Rate Power.") + .SetDefault(-0.5f); + AddComment(R"DOC( +FTRL (Follow The Regularized Leader) Operator. 
+ +Optimizer that implements the FTRL algorithm: + +$$ +new\_accum = squared\_accum + grad^2 \\ +if (lr\_power == -0.5) { + linear\_accum += grad - (\surd(new\_accum) - \surd(squared\_accum)) / + (learning\_rate * param) \\ +} else { + linear\_accum += grad - + (new\_accum^{-lr\_power} - accum^{-lr\_power}) / + (learning\_rate * param) \\ +} + +x = (l1 * sign(linear\_accum) - linear\_accum) +if (lr\_power == -0.5) { + y = \frac{\surd(new\_accum)}{learning\_rate} + (2 * l2) \\ + pre\_shrink = \frac{x}{y} \\ + param = (abs(linear\_accum) > l1).select(pre\_shrink, 0.0) \\ +} else { + y = \frac{new\_accum^{-lr\_power}}{learning\_rate} + (2 * l2) \\ + pre\_shrink = \frac{x}{y} \\ + param = (abs(linear\_accum) > l1).select(pre\_shrink, 0.0) \\ +} +squared\_accum += grad^2; +$$ + +The paper that proposed Follow The Regularized Leader (FTRL): +(https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf) + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(ftrl, ops::FTRLOp, ops::FTRLOpMaker); +REGISTER_OP_CPU_KERNEL( + ftrl, ops::FTRLOpKernel); diff --git a/paddle/fluid/operators/ftrl_op.cu b/paddle/fluid/operators/ftrl_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..dbdfcb927e0aff373a716c5e0eace96bec38e9ad --- /dev/null +++ b/paddle/fluid/operators/ftrl_op.cu @@ -0,0 +1,19 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +You may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed +under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +CONDITIONS OF ANY KIND, either express or implied. See the License for the +specific language governing permissions and limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/ftrl_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + ftrl, ops::FTRLOpKernel); diff --git a/paddle/fluid/operators/ftrl_op.h b/paddle/fluid/operators/ftrl_op.h new file mode 100644 index 0000000000000000000000000000000000000000..0a9405fcef1fa405ab14ba7c797b99c2259892f7 --- /dev/null +++ b/paddle/fluid/operators/ftrl_op.h @@ -0,0 +1,96 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; + +template +class FTRLOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* param_out = ctx.Output("ParamOut"); + auto* sq_accum_out = ctx.Output("SquaredAccumOut"); + auto* lin_accum_out = ctx.Output("LinearAccumOut"); + + param_out->mutable_data(ctx.GetPlace()); + sq_accum_out->mutable_data(ctx.GetPlace()); + lin_accum_out->mutable_data(ctx.GetPlace()); + + auto grad = ctx.Input("Grad"); + + auto l1 = static_cast(ctx.Attr("l1")); + auto l2 = static_cast(ctx.Attr("l2")); + auto lr_power = static_cast(ctx.Attr("lr_power")); + + auto p = EigenVector::Flatten(*ctx.Input("Param")); + auto sq_accum = + EigenVector::Flatten(*ctx.Input("SquaredAccumulator")); + auto lin_accum = + EigenVector::Flatten(*ctx.Input("LinearAccumulator")); + auto g = EigenVector::Flatten(*grad); + auto lr = EigenVector::Flatten(*ctx.Input("LearningRate")); + + auto p_out = EigenVector::Flatten(*param_out); + auto s_acc_out = EigenVector::Flatten(*sq_accum_out); + auto l_acc_out = EigenVector::Flatten(*lin_accum_out); + auto& place = *ctx.template device_context().eigen_device(); + + Eigen::DSizes grad_dsize(grad->numel()); + + auto new_accum = sq_accum + g * g; + // Special case for lr_power = -0.5 + if (lr_power == static_cast(-0.5)) { + l_acc_out.device(place) = + lin_accum + g - + ((new_accum.sqrt() - sq_accum.sqrt()) / lr.broadcast(grad_dsize)) * p; + } else { + l_acc_out.device(place) = + lin_accum + g - + ((new_accum.pow(-lr_power) - sq_accum.pow(-lr_power)) / + lr.broadcast(grad_dsize)) * + p; + } + + auto x = (l_acc_out.constant(l1) * l_acc_out.sign() - l_acc_out); + if (lr_power == static_cast(-0.5)) { + auto y = (new_accum.sqrt() / lr.broadcast(grad_dsize)) + + l_acc_out.constant(static_cast(2) * l2); + auto pre_shrink = x / y; + p_out.device(place) = + (l_acc_out.abs() > l_acc_out.constant(l1)) + .select(pre_shrink, p.constant(static_cast(0))); + } else { + auto y = (new_accum.pow(-lr_power) / lr.broadcast(grad_dsize)) + + l_acc_out.constant(static_cast(2) * l2); + auto pre_shrink = x / y; + p_out.device(place) = + (l_acc_out.abs() > l_acc_out.constant(l1)) + .select(pre_shrink, p.constant(static_cast(0))); + } + + s_acc_out.device(place) = sq_accum + g * g; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/gather.cu.h b/paddle/fluid/operators/gather.cu.h new file mode 100644 index 0000000000000000000000000000000000000000..af5898e29ecaed5a4d2cf8372a3bb20f192fc776 --- /dev/null +++ b/paddle/fluid/operators/gather.cu.h @@ -0,0 +1,79 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; +using platform::DeviceContext; + +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +template +__global__ void GatherCUDAKernel(const T* params, const int* indices, T* output, + size_t index_size, size_t slice_size) { + CUDA_1D_KERNEL_LOOP(i, index_size * slice_size) { + int indices_i = i / slice_size; + int slice_i = i - indices_i * slice_size; // offset inside the slice + int gather_i = indices[indices_i]; + int params_i = gather_i * slice_size + slice_i; + *(output + i) = *(params + params_i); + } +} + +/** + * A thin wrapper on gpu tensor + * Return a new tensor from source tensor, gathered according to index + * input[src]: type-T source Tensor + * input[index]: type-int index Tensor (1-D) + * return: output tensor + */ +template +void GPUGather(const platform::DeviceContext& ctx, const Tensor& src, + const Tensor& index, Tensor* output) { + // PADDLE_ENFORCE(platform::is_gpu_place(place)); + // check index of shape 1-D + PADDLE_ENFORCE(index.dims().size() == 1); + int index_size = index.dims()[0]; + + auto src_dims = src.dims(); + framework::DDim output_dims(src_dims); + output_dims[0] = index_size; + + // slice size + int slice_size = 1; + for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i]; + + const T* p_src = src.data(); + const int* p_index = index.data(); + T* p_output = output->data(); + + int block = 512; + int n = slice_size * index_size; + int grid = (n + block - 1) / block; + + GatherCUDAKernel<<< + grid, block, 0, + reinterpret_cast(ctx).stream()>>>( + p_src, p_index, p_output, index_size, slice_size); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/gather.h b/paddle/fluid/operators/gather.h new file mode 100644 index 0000000000000000000000000000000000000000..287732eeb6e5249f631bc3e39cd18bc050f9fc3b --- /dev/null +++ b/paddle/fluid/operators/gather.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include + +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +/** + * A thin wrapper for gathering on cpu tensor + * Return a new tensor from source tensor, gathered according to index + * input[src]: type-T source Tensor + * input[index]: type-int index Tensor (1-D) + * return: output tensor + */ +template +void CPUGather(const platform::DeviceContext& ctx, const Tensor& src, + const Tensor& index, Tensor* output) { + PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace())); + // check index of shape 1-D + PADDLE_ENFORCE(index.dims().size() == 1); + int index_size = index.dims()[0]; + + auto src_dims = src.dims(); + framework::DDim output_dims(src_dims); + output_dims[0] = index_size; + + const T* p_src = src.data(); + const int* p_index = index.data(); + T* p_output = output->data(); + + // slice size + int slice_size = 1; + for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i]; + + const size_t slice_bytes = slice_size * sizeof(T); + + for (int i = 0; i < index_size; ++i) { + int index_ = p_index[i]; + memcpy(p_output + i * slice_size, p_src + index_ * slice_size, slice_bytes); + } +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..dceeb71ee3552bcb462014b5f08a59d4406497ad --- /dev/null +++ b/paddle/fluid/operators/gather_op.cc @@ -0,0 +1,106 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/gather_op.h" +#include "paddle/fluid/framework/ddim.h" + +namespace paddle { +namespace operators { + +class GatherOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of GatherOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Index"), + "Input(Index) of GatherOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of GatherOp should not be null."); + + auto index_dims = ctx->GetInputDim("Index"); + PADDLE_ENFORCE(index_dims.size() == 1); + int batch_size = ctx->GetInputDim("Index")[0]; + PADDLE_ENFORCE_GE(batch_size, 0, "Batch size must be >0"); + framework::DDim output_dims(ctx->GetInputDim("X")); + output_dims[0] = batch_size; + ctx->SetOutputDim("Out", output_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +class GatherGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +class GatherOpMaker : public framework::OpProtoAndCheckerMaker { + public: + GatherOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The source input of gather op"); + AddInput("Index", "The index input of gather op"); + AddOutput("Out", "The output of gather op"); + AddComment(R"DOC( +Gather Operator. + +$Out = X[Index]$ + +Out is obtained by gathering entries of the outer-most dimension +of X indexed by Index and concatenate them together. + +Example: + +X = [[1, 2], + [3, 4], + [5, 6]] + +Index = [[1, 2]] + +Then: + +Out = [[3, 4], + [5, 6]] + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(gather, ops::GatherOp, ops::GatherOpMaker, gather_grad, + ops::GatherGradOp); +REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel); +REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel); diff --git a/paddle/fluid/operators/gather_op.cu b/paddle/fluid/operators/gather_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..484f4232624e862aff7c0aff337b4e5df65d5be3 --- /dev/null +++ b/paddle/fluid/operators/gather_op.cu @@ -0,0 +1,65 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "gather.cu.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/operators/gather_op.h" +#include "scatter.cu.h" + +namespace paddle { +namespace operators { + +template +class GatherOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "This kernel only runs on GPU device."); + auto *x = ctx.Input("X"); + auto *index = ctx.Input("Index"); + auto *output = ctx.Output("Out"); + + output->mutable_data(ctx.GetPlace()); + + GPUGather(ctx.device_context(), *x, *index, output); + } +}; + +template +class GatherGradOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "This kernel only runs on GPU device."); + auto *Index = ctx.Input("Index"); + auto *dX = ctx.Output(framework::GradVarName("X")); + auto *dO = ctx.Input(framework::GradVarName("Out")); + auto *x = ctx.Input("X"); + + dX->mutable_data(ctx.GetPlace()); + auto dxt = framework::EigenVector::Flatten(*dX); + auto &place = *ctx.template device_context() + .eigen_device(); + dxt.device(place) = dxt.constant(static_cast(0)); + + GPUScatterAssign(ctx.device_context(), *dO, *Index, dX); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel); diff --git a/paddle/fluid/operators/gather_op.h b/paddle/fluid/operators/gather_op.h new file mode 100644 index 0000000000000000000000000000000000000000..7ba4a31c81be025978c6c2a325792eea2eb353a7 --- /dev/null +++ b/paddle/fluid/operators/gather_op.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "gather.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "scatter.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class GatherOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), + "This kernel only runs on CPU."); + + auto *x = ctx.Input("X"); + auto *index = ctx.Input("Index"); + auto *output = ctx.Output("Out"); + + output->mutable_data(ctx.GetPlace()); + + CPUGather(ctx.device_context(), *x, *index, output); + } +}; + +template +class GatherGradientOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), + "This kernel only runs on CPU."); + + auto *Index = ctx.Input("Index"); + auto *dX = ctx.Output(framework::GradVarName("X")); + auto *dO = ctx.Input(framework::GradVarName("Out")); + + dX->mutable_data(ctx.GetPlace()); + auto dxt = framework::EigenVector::Flatten(*dX); + auto &place = *ctx.template device_context() + .eigen_device(); + dxt.device(place) = dxt.constant(static_cast(0)); + + ScatterAssign(ctx.device_context(), *dO, *Index, dX); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/gather_test.cc b/paddle/fluid/operators/gather_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..4d86cf5ce334705d16435f542cc33be454edabb7 --- /dev/null +++ b/paddle/fluid/operators/gather_test.cc @@ -0,0 +1,54 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
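The gradient path in GatherGradientOpKernel above zero-fills Input(X@GRAD) and then scatters the rows of Out@GRAD back to the positions named by Index. A standalone sketch of that pattern, with made-up names and no Paddle dependencies, looks roughly like this:

#include <algorithm>
#include <cstddef>
#include <cstring>
#include <iostream>
#include <vector>

// Write rows of `d_out` into `d_x` at the row positions given by `index`,
// after zero-initializing `d_x` (mirrors the zero-then-scatter-assign pattern).
void ScatterRows(const std::vector<float>& d_out, const std::vector<int>& index,
                 int cols, std::vector<float>* d_x) {
  std::fill(d_x->begin(), d_x->end(), 0.0f);
  for (std::size_t i = 0; i < index.size(); ++i) {
    std::memcpy(d_x->data() + index[i] * cols, d_out.data() + i * cols,
                cols * sizeof(float));
  }
}

int main() {
  std::vector<float> d_out = {3, 4, 5, 6};    // gradient w.r.t. Out, shape 2 x 2
  std::vector<int> index = {1, 2};
  std::vector<float> d_x(3 * 2);              // gradient w.r.t. X, shape 3 x 2
  ScatterRows(d_out, index, /*cols=*/2, &d_x);
  for (float v : d_x) std::cout << v << ' ';  // prints: 0 0 3 4 5 6
  std::cout << '\n';
  return 0;
}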
*/ + +#include "paddle/fluid/operators/gather.h" +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/place.h" + +#include +#include +#include + +TEST(Gather, GatherData) { + using namespace paddle::framework; + using namespace paddle::platform; + using namespace paddle::operators; + + Tensor* src = new Tensor(); + Tensor* index = new Tensor(); + Tensor* output = new Tensor(); + + int* p_src = nullptr; + int* p_index = nullptr; + p_src = src->mutable_data(make_ddim({3, 4}), CPUPlace()); + p_index = index->mutable_data(make_ddim({2}), CPUPlace()); + + for (int i = 0; i < 12; ++i) p_src[i] = i; + p_index[0] = 1; + p_index[1] = 0; + + int* p_output = output->mutable_data(make_ddim({2, 4}), CPUPlace()); + + auto* cpu_place = new paddle::platform::CPUPlace(); + paddle::platform::CPUDeviceContext ctx(*cpu_place); + CPUGather(ctx, *src, *index, output); + + for (int i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], i + 4); + for (int i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], i - 4); + + delete src; + delete index; + delete output; +} diff --git a/paddle/fluid/operators/gaussian_random_op.cc b/paddle/fluid/operators/gaussian_random_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..b090f8759765039eadfc900361bcdabe215c2225 --- /dev/null +++ b/paddle/fluid/operators/gaussian_random_op.cc @@ -0,0 +1,113 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class CPUGaussianRandomKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + float mean = context.Attr("mean"); + float std = context.Attr("std"); + auto* tensor = context.Output("Out"); + T* data = tensor->mutable_data(context.GetPlace()); + + unsigned int seed = static_cast(context.Attr("seed")); + std::minstd_rand engine; + if (seed == 0) { + seed = std::random_device()(); + } + engine.seed(seed); + std::normal_distribution dist(mean, std); + int64_t size = tensor->numel(); + for (int64_t i = 0; i < size; ++i) { + data[i] = dist(engine); + } + } +}; + +class GaussianRandomOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of GaussianRandomOp should not be null."); + auto shape = ctx->Attrs().Get>("shape"); + std::vector temp; + temp.reserve(shape.size()); + for (auto dim : shape) { + temp.push_back(static_cast(dim)); + } + PADDLE_ENFORCE(shape.size() > 0UL, + "shape can be one int or array. 
shape must be set."); + ctx->SetOutputDim("Out", framework::make_ddim(temp)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + static_cast(ctx.Attr("dtype")), + ctx.device_context()); + } +}; + +class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker { + public: + GaussianRandomOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddOutput("Out", "Output matrix of gaussian random op"); + + AddAttr>("shape", + "(vector) " + "The dimension of random tensor."); + AddAttr("mean", + "(float, default 0.0) " + "mean of random tensor.") + .SetDefault(.0f); + AddAttr("std", + "(float, default 1.0) " + "std of random tensor.") + .SetDefault(1.0f); + AddAttr("seed", + "(int, default 0) " + "Random seed of generator." + "0 means use system wide seed.") + .SetDefault(0); + AddAttr("dtype", + "(int, default 5(FP32)) " + "Output data type.") + .SetDefault(framework::proto::DataType::FP32); + + AddComment(R"DOC( +GaussianRandom Operator. + +Used to initialize tensors with gaussian random generator. + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(gaussian_random, ops::GaussianRandomOp, + ops::GaussianRandomOpMaker); +REGISTER_OP_CPU_KERNEL(gaussian_random, ops::CPUGaussianRandomKernel); diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..70d655d4bb259bf33765fa42e46a19510ffca35d --- /dev/null +++ b/paddle/fluid/operators/gaussian_random_op.cu @@ -0,0 +1,64 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
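The CPU kernel above draws from std::normal_distribution using a std::minstd_rand engine and falls back to std::random_device when the seed attribute is 0, so a fixed non-zero seed yields reproducible tensors. A minimal standalone sketch of that seeding behaviour (names are ad hoc, not Paddle API):

#include <cstddef>
#include <iostream>
#include <random>
#include <vector>

std::vector<float> GaussianFill(std::size_t n, float mean, float std_dev,
                                unsigned int seed) {
  if (seed == 0) seed = std::random_device()();  // non-deterministic fallback
  std::minstd_rand engine(seed);
  std::normal_distribution<float> dist(mean, std_dev);
  std::vector<float> data(n);
  for (auto& v : data) v = dist(engine);
  return data;
}

int main() {
  auto a = GaussianFill(4, 0.0f, 1.0f, /*seed=*/42);
  auto b = GaussianFill(4, 0.0f, 1.0f, /*seed=*/42);
  std::cout << (a == b ? "same seed -> same draws" : "mismatch") << '\n';
  return 0;
}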
*/ +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { + +template +struct GaussianGenerator { + T mean_, std_; + unsigned int seed_; + + __host__ __device__ GaussianGenerator(T mean, T std, int seed) + : mean_(mean), std_(std), seed_(seed) {} + + __host__ __device__ T operator()(const unsigned int n) const { + thrust::minstd_rand rng; + rng.seed(seed_); + thrust::normal_distribution dist(mean_, std_); + rng.discard(n); + return dist(rng); + } +}; + +template +class GPUGaussianRandomKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* tensor = context.Output("Out"); + T* data = tensor->mutable_data(context.GetPlace()); + unsigned int seed = static_cast(context.Attr("seed")); + if (seed == 0) { + std::random_device rd; + seed = rd(); + } + T mean = static_cast(context.Attr("mean")); + T std = static_cast(context.Attr("std")); + thrust::counting_iterator index_sequence_begin(0); + int64_t size = tensor->numel(); + thrust::transform(index_sequence_begin, index_sequence_begin + size, + thrust::device_ptr(data), + GaussianGenerator(mean, std, seed)); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL(gaussian_random, + paddle::operators::GPUGaussianRandomKernel); diff --git a/paddle/fluid/operators/get_places_op.cc b/paddle/fluid/operators/get_places_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..ba908e472bbc165a244d8543713f1dbf293abb48 --- /dev/null +++ b/paddle/fluid/operators/get_places_op.cc @@ -0,0 +1,117 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detail/safe_ref.h" +#include "paddle/fluid/platform/place.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/gpu_info.h" +#endif + +namespace paddle { +namespace operators { + +static size_t CUDADevCount() { +#ifdef PADDLE_WITH_CUDA + return platform::GetCUDADeviceCount(); +#else + return 0UL; +#endif +} + +class GetPlacesOp : public framework::OperatorBase { + public: + GetPlacesOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + bool is_gpu; + if (Attr("device_type") == "AUTO") { + is_gpu = platform::is_gpu_place(place); + } else { + is_gpu = Attr("device_type") == "CUDA"; + } + auto device_count = static_cast(Attr("device_count")); + if (device_count == 0) { + device_count = + is_gpu ? CUDADevCount() : std::thread::hardware_concurrency(); + } + PADDLE_ENFORCE_NE(device_count, 0, "Cannot indicate %s device count", + is_gpu ? 
"GPU" : "CPU"); + + auto out_var_name = Output("Out"); + auto &places = + *(detail::Ref(scope.FindVar(out_var_name), + "Output variable %s cannot be found", out_var_name) + .GetMutable()); + places.reserve(device_count); + if (is_gpu) { + PADDLE_ENFORCE_LE(device_count, CUDADevCount(), + "Only %d CUDA devices found, cannot set to %d", + CUDADevCount(), device_count); + for (size_t i = 0; i < device_count; ++i) { + places.emplace_back(platform::CUDAPlace(static_cast(i))); + } + } else { + for (size_t i = 0; i < device_count; ++i) { + places.emplace_back(platform::CPUPlace()); + } + } + } +}; + +class GetPlacesOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + GetPlacesOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddOutput("Out", "vector of Place"); + AddAttr("device_count", "device count").SetDefault(0); + AddAttr("device_type", "device type") + .InEnum({"CUDA", "CPU", "AUTO"}) + .SetDefault("AUTO"); + AddComment(R"DOC( +Returns a list of places based on flags. The list will be used for parallel +execution. +)DOC"); + } +}; + +class GetPlacesInferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + for (auto &o_name : op_desc.Output("Out")) { + block->FindRecursiveOrCreateVar(o_name).SetType( + framework::proto::VarDesc::PLACE_LIST); + } + } +}; + +class GetPlacesInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + // Do nothing + } +}; + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; + +REGISTER_OPERATOR(get_places, ops::GetPlacesOp, ops::GetPlacesOpProtoMaker, + ops::GetPlacesInferVarType, ops::GetPlacesInferShape, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/gru_op.cc b/paddle/fluid/operators/gru_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..1436e55b0e13b8b327a61e7c91294fa958b146c4 --- /dev/null +++ b/paddle/fluid/operators/gru_op.cc @@ -0,0 +1,224 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/gru_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class GRUOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(%s) of GRUOp should not be null.", "Input"); + PADDLE_ENFORCE(ctx->HasInput("Weight"), + "Input(%s) of GRUOp should not be null.", "Weight"); + PADDLE_ENFORCE(ctx->HasOutput("BatchGate"), + "Output(%s) of GRUOp should not be null.", "BatchGate"); + PADDLE_ENFORCE(ctx->HasOutput("BatchResetHiddenPrev"), + "Output(%s) of GRUOp should not be null.", + "BatchResetHiddenPrev"); + PADDLE_ENFORCE(ctx->HasOutput("BatchHidden"), + "Output(%s) of GRUOp should not be null.", "BatchHidden"); + PADDLE_ENFORCE(ctx->HasOutput("Hidden"), + "Output(%s) of GRUOp should not be null.", "Hidden"); + auto input_dims = ctx->GetInputDim("Input"); + auto weight_dims = ctx->GetInputDim("Weight"); + int input_size = input_dims[1]; + int frame_size = weight_dims[0]; + PADDLE_ENFORCE_EQ(input_size, frame_size * 3, + "The input_size must be 3 times of frame_size in GRUOp."); + PADDLE_ENFORCE_EQ( + weight_dims[1], frame_size * 3, + "The shape of Weight matrix must be [frame_size, frame_size * 3]."); + if (ctx->HasInput("H0")) { + auto h0_dims = ctx->GetInputDim("H0"); + PADDLE_ENFORCE_EQ(h0_dims[1], frame_size, + "The width of H0 must be equal to frame_size."); + } + if (ctx->HasInput("Bias")) { + auto bias_dims = ctx->GetInputDim("Bias"); + int bias_height = bias_dims[0]; + int bias_width = bias_dims[1]; + PADDLE_ENFORCE_EQ(bias_height, 1, + "The shape of Bias must be [1, frame_size * 3]."); + PADDLE_ENFORCE_EQ(bias_width, frame_size * 3, + "The shape of Bias must be [1, frame_size * 3]."); + } + ctx->SetOutputDim("BatchGate", input_dims); + ctx->SetOutputDim("BatchResetHiddenPrev", {input_dims[0], frame_size}); + ctx->SetOutputDim("BatchHidden", {input_dims[0], frame_size}); + ctx->SetOutputDim("Hidden", {input_dims[0], frame_size}); + ctx->ShareLoD("Input", "Hidden"); + } +}; + +class GRUOpMaker : public framework::OpProtoAndCheckerMaker { + public: + GRUOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Input", + "(LoDTensor) The first input is a LodTensor, which supports " + "variable-time length input sequence. The underlying tensor in " + "this LoDTenosr is a matrix with shape (T X 3D), where, T is the " + "total time steps in this mini-batch, D is the hidden size."); + AddInput("H0", + "(Tensor, optional) The initial hidden state is an optional " + "input. This is a tensor with shape (N x D), where N is the " + "batch size, D is the hidden size.") + .AsDispensable(); + AddInput( + "Weight", + "(Tensor) The learnable hidden-hidden weight matrix with shape " + "(D x 3D), where D is the hidden size. The elements continuous in " + "memory can be divided into two parts. The first part are weights of " + "the update gate and reset gate with shape (D x 2D), and the second " + "part are weights of output candidate with shape (D x D)."); + AddInput("Bias", + "(Tensor, optional) Bias vector with shape (1 x 3D) concating " + "bias of the update gate, reset gate and output candidate.") + .AsDispensable(); + AddOutput("BatchGate", + "(LoDTensor) To compute with batches, sequence data will be " + "reorganized into several successive batches each containing " + "data from the same time step. 
The LoDTensor BatchGate contains " + "the update gate, reset gate and output candidate values " + "organized in batches. The LoD size is 2. The first LoD contains " + "the batch offsets and the second LoD contains the indexes in " + "the raw sequence data.") + .AsIntermediate(); + AddOutput( + "BatchResetHiddenPrev", + "(LoDTensor) The reseted hidden state LoDTensor organized in batches. " + "This LoDTensor is a matrix with shape (T X D) and has the same LoD " + "with `BatchGate`.") + .AsIntermediate(); + AddOutput( + "BatchHidden", + "(LoDTensor) The hidden state LoDTensor organized in batches. " + "This LoDTensor is a matrix with shape (T X D) and has the same LoD " + "with `BatchGate`.") + .AsIntermediate(); + AddOutput( + "Hidden", + "(LoDTensor) the hidden state LoDTensor organized in sequences. " + "This LoDTensor is a matrix with shape (T X D) and has the same LoD " + "with `BatchGate`."); + AddAttr("activation", + "(string, default tanh) " + "The activation type used for output candidate {h}_t.") + .SetDefault("tanh"); + AddAttr( + "gate_activation", + "(string, default sigmoid) " + "The activation type used in update gate and reset gate.") + .SetDefault("sigmoid"); + AddAttr("is_reverse", + "(bool, defalut: False) " + "whether to compute reversed GRU.") + .SetDefault(false); + AddComment(R"DOC( +GRU Operator implements part calculations of the complete GRU as following: + +$$ +update\_gate: u_t = actGate(xu_t + W_u * h_{t-1} + b_u) \\ +reset\_gate: r_t = actGate(xr_t + W_r * h_{t-1} + b_r) \\ +output\_candidate: {h}_t = actNode(xc_t + W_c * dot(r_t, h_{t-1}) + b_c) \\ +output: h_t = dot((1 - u_t), h_{t-1}) + dot(u_t, {h}_t) +$$ + +@note To implement the complete GRU, fully-connected operator must be used +before to feed xu, xr and xc as the Input of GRU operator. 
+)DOC"); + } +}; + +class GRUGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(%s) of GRUGradOp should not be null.", "Input"); + PADDLE_ENFORCE(ctx->HasInput("Weight"), + "Input(%s) of GRUGradOp should not be null.", "Weight"); + PADDLE_ENFORCE(ctx->HasInput("BatchGate"), + "Input(%s) of GRUGradOp should not be null.", "BatchGate"); + PADDLE_ENFORCE(ctx->HasInput("BatchResetHiddenPrev"), + "Input(%s) of GRUGradOp should not be null.", + "BatchResetHiddenPrev"); + PADDLE_ENFORCE(ctx->HasInput("BatchHidden"), + "Input(%s) of GRUOp should not be null.", "BatchHidden"); + PADDLE_ENFORCE(ctx->HasInput("Hidden"), + "Input(%s) of GRUGradOp should not be null.", "Hidden"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Hidden")), + "Input(%s@GRAD) of GRUGradOp should not be null.", "Hidden"); + auto input_dims = ctx->GetInputDim("Input"); + auto weight_dims = ctx->GetInputDim("Weight"); + int input_size = input_dims[1]; + int frame_size = weight_dims[0]; + int weight_height = weight_dims[0]; + int weight_width = weight_dims[1]; + PADDLE_ENFORCE_EQ(input_size, frame_size * 3, + "The input_size must be 3 times of frame_size in GRUOp."); + PADDLE_ENFORCE_EQ( + weight_height, frame_size, + "The shape of Weight matrix must be [frame_size, frame_size * 3]."); + PADDLE_ENFORCE_EQ( + weight_width, frame_size * 3, + "The shape of Weight matrix must be [frame_size, frame_size * 3]."); + if (ctx->HasInput("H0")) { + auto h0_dims = ctx->GetInputDim("H0"); + PADDLE_ENFORCE_EQ(h0_dims[1], frame_size, + "The width of H0 must be equal to frame_size."); + auto h0_grad_name = framework::GradVarName("H0"); + if (ctx->HasOutput(h0_grad_name)) + ctx->SetOutputDim(h0_grad_name, h0_dims); + } + if (ctx->HasInput("Bias")) { + auto bias_dims = ctx->GetInputDim("Bias"); + int bias_height = bias_dims[0]; + int bias_width = bias_dims[1]; + PADDLE_ENFORCE_EQ(bias_height, 1, + "The shape of Bias must be [1, frame_size * 3]."); + PADDLE_ENFORCE_EQ(bias_width, frame_size * 3, + "The shape of Bias must be [1, frame_size * 3]."); + auto bias_grad_name = framework::GradVarName("Bias"); + if (ctx->HasOutput(bias_grad_name)) + ctx->SetOutputDim(bias_grad_name, bias_dims); + } + auto input_grad_name = framework::GradVarName("Input"); + if (ctx->HasOutput(input_grad_name)) + ctx->SetOutputDim(input_grad_name, input_dims); + auto weight_grad_name = framework::GradVarName("Weight"); + if (ctx->HasOutput(weight_grad_name)) + ctx->SetOutputDim(weight_grad_name, weight_dims); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(gru, ops::GRUOp, ops::GRUOpMaker, gru_grad, ops::GRUGradOp); +REGISTER_OP_CPU_KERNEL( + gru, ops::GRUKernel, + ops::GRUKernel); +REGISTER_OP_CPU_KERNEL( + gru_grad, ops::GRUGradKernel, + ops::GRUGradKernel); diff --git a/paddle/fluid/operators/gru_op.cu.cc b/paddle/fluid/operators/gru_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..e908d01d2920af8bdbbdc694944e62a86bad327a --- /dev/null +++ b/paddle/fluid/operators/gru_op.cu.cc @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
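To make the equations in the GRU Operator comment concrete, the following standalone sketch computes a single time step for one sequence. It assumes the fully-connected projections xu_t, xr_t and xc_t are already packed into x (as the @note requires), uses sigmoid/tanh for the gate and candidate activations, and omits the biases and the batch reordering done by the real kernel; all names are ad hoc.

#include <cmath>
#include <iostream>
#include <vector>

static float Sigmoid(float v) { return 1.0f / (1.0f + std::exp(-v)); }

// out = W * v for a row-major D x D matrix W.
static std::vector<float> MatVec(const std::vector<float>& W,
                                 const std::vector<float>& v, int D) {
  std::vector<float> out(D, 0.0f);
  for (int i = 0; i < D; ++i)
    for (int j = 0; j < D; ++j) out[i] += W[i * D + j] * v[j];
  return out;
}

// x holds the precomputed projections [xu_t, xr_t, xc_t] (size 3 * D).
std::vector<float> GRUStep(const std::vector<float>& x,
                           const std::vector<float>& h_prev,
                           const std::vector<float>& Wu,
                           const std::vector<float>& Wr,
                           const std::vector<float>& Wc, int D) {
  std::vector<float> u = MatVec(Wu, h_prev, D);
  std::vector<float> r = MatVec(Wr, h_prev, D);
  for (int i = 0; i < D; ++i) {
    u[i] = Sigmoid(x[i] + u[i]);        // update gate u_t
    r[i] = Sigmoid(x[D + i] + r[i]);    // reset gate r_t
  }
  std::vector<float> rh(D);
  for (int i = 0; i < D; ++i) rh[i] = r[i] * h_prev[i];  // r_t (.) h_{t-1}
  std::vector<float> c = MatVec(Wc, rh, D);
  std::vector<float> h(D);
  for (int i = 0; i < D; ++i) {
    c[i] = std::tanh(x[2 * D + i] + c[i]);             // output candidate {h}_t
    h[i] = (1.0f - u[i]) * h_prev[i] + u[i] * c[i];    // new hidden state h_t
  }
  return h;
}

int main() {
  const int D = 2;
  std::vector<float> x(3 * D, 0.1f), h_prev(D, 0.5f);
  std::vector<float> Wu(D * D, 0.1f), Wr(D * D, 0.1f), Wc(D * D, 0.1f);
  for (float v : GRUStep(x, h_prev, Wu, Wr, Wc, D)) std::cout << v << ' ';
  std::cout << '\n';
  return 0;
}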
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/gru_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + gru, ops::GRUKernel, + ops::GRUKernel); +REGISTER_OP_CUDA_KERNEL( + gru_grad, ops::GRUGradKernel, + ops::GRUGradKernel); diff --git a/paddle/fluid/operators/gru_op.h b/paddle/fluid/operators/gru_op.h new file mode 100644 index 0000000000000000000000000000000000000000..37f3ae1a837c77bd5e3696abbd9ae14257a7f5d7 --- /dev/null +++ b/paddle/fluid/operators/gru_op.h @@ -0,0 +1,261 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/operators/math/detail/activation_functions.h" +#include "paddle/fluid/operators/math/gru_compute.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/sequence2batch.h" + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using Tensor = framework::Tensor; + +template +inline void ReorderInitState(const DeviceContext& ctx, + const framework::Tensor& src, + framework::Vector index_lod, + framework::Tensor* dst, bool indexed_src) { + math::CopyMatrixRowsFunctor row_shuffle; + dst->mutable_data(src.dims(), ctx.GetPlace()); + row_shuffle(ctx, src, index_lod, *dst, indexed_src); +} + +template +class GRUKernel : public framework::OpKernel { + public: + void BatchCompute(const framework::ExecutionContext& context) const { + auto* input = context.Input("Input"); + auto* h0 = context.Input("H0"); + auto* weight = context.Input("Weight"); + const T* weight_data = weight->data(); + auto* bias = context.Input("Bias"); + auto* batch_gate = context.Output("BatchGate"); + batch_gate->mutable_data(context.GetPlace()); + auto* batch_reset_hidden_prev = + context.Output("BatchResetHiddenPrev"); + batch_reset_hidden_prev->mutable_data(context.GetPlace()); + auto* batch_hidden = context.Output("BatchHidden"); + batch_hidden->mutable_data(context.GetPlace()); + auto* hidden = context.Output("Hidden"); + hidden->mutable_data(context.GetPlace()); + + context.ShareLoD("Input", "Hidden"); + + auto hidden_dims = hidden->dims(); + + bool is_reverse = context.Attr("is_reverse"); + math::LoDTensor2BatchFunctor to_batch; + auto& dev_ctx = context.template device_context(); + to_batch(dev_ctx, *input, *batch_gate, true, is_reverse); + + if (bias) { + math::RowwiseAdd add_bias; + add_bias(dev_ctx, *batch_gate, *bias, batch_gate); + } + + int frame_size = hidden_dims[1]; + math::GRUMetaValue gru_value; + gru_value.gate_weight = 
const_cast(weight_data); + gru_value.state_weight = + const_cast(weight_data + 2 * frame_size * frame_size); + Tensor ordered_h0; + + framework::Vector order(batch_gate->lod()[2]); + + if (h0) { + // Since the batch computing for GRU reorders the input sequences + // according to their length. The initialized cell state also needs + // to reorder. + ReorderInitState( + context.template device_context(), *h0, order, + &ordered_h0, true); + gru_value.prev_out_value = ordered_h0.data(); + } else { + gru_value.prev_out_value = nullptr; + } + auto batch_starts = batch_gate->lod()[0]; + size_t num_batch = batch_starts.size() - 1; + auto active_node = math::detail::GetActivationType( + context.Attr("activation")); + auto active_gate = math::detail::GetActivationType( + context.Attr("gate_activation")); + for (size_t n = 0; n < num_batch; n++) { + int bstart = static_cast(batch_starts[n]); + int bend = static_cast(batch_starts[n + 1]); + int cur_batch_size = bend - bstart; + + Tensor gate_t = batch_gate->Slice(bstart, bend); + Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend); + Tensor hidden_t = batch_hidden->Slice(bstart, bend); + gru_value.output_value = hidden_t.data(); + gru_value.gate_value = gate_t.data(); + gru_value.reset_output_value = reset_hidden_prev_t.data(); + math::GRUUnitFunctor::compute( + dev_ctx, gru_value, frame_size, cur_batch_size, active_node, + active_gate); + gru_value.prev_out_value = gru_value.output_value; + } + + math::Batch2LoDTensorFunctor to_seq; + batch_hidden->set_lod(batch_gate->lod()); + to_seq(dev_ctx, *batch_hidden, *hidden); + } + + void Compute(const framework::ExecutionContext& context) const override { + BatchCompute(context); + } +}; + +template +class GRUGradKernel : public framework::OpKernel { + public: + void BatchCompute(const framework::ExecutionContext& context) const { + auto* h0 = context.Input("H0"); + auto* weight = context.Input("Weight"); + const T* weight_data = weight->data(); + auto* batch_gate = context.Input("BatchGate"); + auto* batch_reset_hidden_prev = + context.Input("BatchResetHiddenPrev"); + auto* batch_hidden = context.Input("BatchHidden"); + auto* hidden = context.Input("Hidden"); + auto* hidden_grad = + context.Input(framework::GradVarName("Hidden")); + auto* input_grad = + context.Output(framework::GradVarName("Input")); + auto* h0_grad = context.Output(framework::GradVarName("H0")); + auto* weight_grad = + context.Output(framework::GradVarName("Weight")); + auto* bias_grad = context.Output(framework::GradVarName("Bias")); + + auto gate_dims = batch_gate->dims(); + auto hidden_dims = hidden->dims(); + int frame_size = hidden_dims[1]; + + math::LoDTensor2BatchFunctor to_batch; + LoDTensor batch_hidden_grad, batch_gate_grad, batch_reset_hidden_prev_grad; + batch_hidden_grad.mutable_data(hidden_dims, context.GetPlace()); + batch_gate_grad.mutable_data(gate_dims, context.GetPlace()); + batch_reset_hidden_prev_grad.mutable_data(hidden_dims, + context.GetPlace()); + math::SetConstant zero; + auto& dev_ctx = context.template device_context(); + zero(dev_ctx, &batch_hidden_grad, static_cast(0.0)); + zero(dev_ctx, &batch_gate_grad, static_cast(0.0)); + zero(dev_ctx, &batch_reset_hidden_prev_grad, static_cast(0.0)); + + Tensor ordered_h0, ordered_h0_grad; + + framework::Vector order(batch_gate->lod()[2]); + + if (h0) { + ReorderInitState(dev_ctx, *h0, order, &ordered_h0, + true); + } + if (h0_grad) { + ordered_h0_grad.mutable_data(h0_grad->dims(), context.GetPlace()); + zero(context.template device_context(), 
&ordered_h0_grad, + static_cast(0.0)); + } + + bool is_reverse = context.Attr("is_reverse"); + batch_hidden_grad.set_lod(batch_hidden->lod()); + to_batch(dev_ctx, *hidden_grad, batch_hidden_grad, false, is_reverse); + + math::GRUMetaValue gru_value; + gru_value.gate_weight = const_cast(weight_data); + gru_value.state_weight = + const_cast(weight_data + 2 * frame_size * frame_size); + + math::GRUMetaGrad gru_grad; + if (weight_grad) { + gru_grad.gate_weight_grad = + weight_grad->mutable_data(context.GetPlace()); + zero(dev_ctx, weight_grad, static_cast(0.0)); + gru_grad.state_weight_grad = + weight_grad->data() + 2 * frame_size * frame_size; + } else { + gru_grad.gate_weight_grad = nullptr; + gru_grad.state_weight_grad = nullptr; + } + + auto batch_starts = batch_hidden_grad.lod()[0]; + size_t num_batch = batch_starts.size() - 1; + auto active_node = math::detail::GetActivationType( + context.Attr("activation")); + auto active_gate = math::detail::GetActivationType( + context.Attr("gate_activation")); + for (int n = static_cast(num_batch) - 1; n >= 0; n--) { + int bstart = static_cast(batch_starts[n]); + int bend = static_cast(batch_starts[n + 1]); + int cur_batch_size = bend - bstart; + + Tensor gate_t = batch_gate->Slice(bstart, bend); + gru_value.gate_value = gate_t.data(); + Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend); + gru_value.reset_output_value = reset_hidden_prev_t.data(); + + Tensor hidden_grad_t = batch_hidden_grad.Slice(bstart, bend); + gru_grad.output_grad = hidden_grad_t.data(); + Tensor gate_grad_t = batch_gate_grad.Slice(bstart, bend); + gru_grad.gate_grad = gate_grad_t.data(); + Tensor reset_hidden_prev_grad_t = + batch_reset_hidden_prev_grad.Slice(bstart, bend); + gru_grad.reset_output_grad = reset_hidden_prev_grad_t.data(); + if (n == 0) { + gru_value.prev_out_value = h0 ? ordered_h0.data() : nullptr; + gru_grad.prev_out_grad = + h0 && h0_grad ? ordered_h0_grad.data() : nullptr; + } else { + int bstart_pre = static_cast(batch_starts[n - 1]); + Tensor hidden_prev_t = batch_hidden->Slice(bstart_pre, bstart); + gru_value.prev_out_value = hidden_prev_t.data(); + Tensor hidden_prev_grad_t = batch_hidden_grad.Slice(bstart_pre, bstart); + gru_grad.prev_out_grad = hidden_prev_grad_t.data(); + } + + math::GRUUnitGradFunctor::compute( + dev_ctx, gru_value, gru_grad, frame_size, cur_batch_size, active_node, + active_gate); + } + if (input_grad) { + input_grad->mutable_data(context.GetPlace()); + math::Batch2LoDTensorFunctor to_seq; + batch_gate_grad.set_lod(batch_gate->lod()); + to_seq(dev_ctx, batch_gate_grad, *input_grad); + } + if (bias_grad) { + bias_grad->mutable_data(context.GetPlace()); + math::ColwiseSum col_sum; + col_sum(dev_ctx, batch_gate_grad, bias_grad); + } + if (h0 && h0_grad) { + ReorderInitState(dev_ctx, ordered_h0_grad, order, + h0_grad, false); + } + } + + void Compute(const framework::ExecutionContext& context) const override { + BatchCompute(context); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/gru_unit_op.cc b/paddle/fluid/operators/gru_unit_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..21ad3aeb492ee18d465edea6ec0fca7e49d1366b --- /dev/null +++ b/paddle/fluid/operators/gru_unit_op.cc @@ -0,0 +1,209 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/gru_unit_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class GRUUnitOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(%s) of GRUUnitOp should not be null.", "Input"); + PADDLE_ENFORCE(ctx->HasInput("HiddenPrev"), + "Input(%s) of GRUUnitOp should not be null.", "HiddenPrev"); + PADDLE_ENFORCE(ctx->HasInput("Weight"), + "Input(%s) of GRUUnitOp should not be null.", "Weight"); + PADDLE_ENFORCE(ctx->HasOutput("Gate"), + "Output(%s) of GRUUnitOp should not be null.", "Gate"); + PADDLE_ENFORCE(ctx->HasOutput("ResetHiddenPrev"), + "Output(%s) of GRUUnitOp should not be null.", + "ResetHiddenPrev"); + PADDLE_ENFORCE(ctx->HasOutput("Hidden"), + "Output(%s) of GRUUnitOp should not be null.", "Hidden"); + auto input_dims = ctx->GetInputDim("Input"); + auto hidden_prev_dims = ctx->GetInputDim("HiddenPrev"); + auto weight_dims = ctx->GetInputDim("Weight"); + int batch_size = input_dims[0]; + int input_size = input_dims[1]; + int frame_size = hidden_prev_dims[1]; + int weight_height = weight_dims[0]; + int weight_width = weight_dims[1]; + PADDLE_ENFORCE_EQ( + input_size, frame_size * 3, + "The input_size must be 3 times of frame_size in GRUUnitOp."); + PADDLE_ENFORCE_EQ( + weight_height, frame_size, + "The shape of Weight matrix must be [frame_size, frame_size * 3]."); + PADDLE_ENFORCE_EQ( + weight_width, frame_size * 3, + "The shape of Weight matrix must be [frame_size, frame_size * 3]."); + if (ctx->HasInput("Bias")) { + auto bias_dims = ctx->GetInputDim("Bias"); + int bias_height = bias_dims[0]; + int bias_width = bias_dims[1]; + PADDLE_ENFORCE_EQ(bias_height, 1, + "The shape of Bias must be [1, frame_size * 3]."); + PADDLE_ENFORCE_EQ(bias_width, frame_size * 3, + "The shape of Bias must be [1, frame_size * 3]."); + } + ctx->SetOutputDim("Gate", {batch_size, frame_size * 3}); + ctx->SetOutputDim("ResetHiddenPrev", {batch_size, frame_size}); + ctx->SetOutputDim("Hidden", {batch_size, frame_size}); + } +}; + +class GRUUnitOpMaker : public framework::OpProtoAndCheckerMaker { + public: + GRUUnitOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Input", + "(Tensor) Matrix with shape [batch_size, frame_size * 3] for the " + "input."); + AddInput("HiddenPrev", + "(Tensor) Matrix with shape [batch_size, frame_size] for the " + "states of previous time step."); + AddInput( + "Weight", + "(Tensor) Weight matrix with shape [frame_size, frame_size * 3]. " + "The elements continuous in memory can be divided into two parts. 
" + "The first part are weights of the update gate and reset gate " + "with shape [frame_size, frame_size * 2], and the second part are " + "weights of output candidate with shape [frame_size, frame_size]."); + AddInput( + "Bias", + "(Tensor) Bias vector with shape [1, frame_size * 3] concatenating " + "bias of the update gate, reset gate and output candidate.") + .AsDispensable(); + AddOutput("Gate", + "(Tensor) Matrix with shape [batch_size, frame_size * 3] for the " + "output of update gate, reset gate and output candidate.") + .AsIntermediate(); + AddOutput("ResetHiddenPrev", + "(Tensor) Matrix with shape [batch_size, frame_size] for the " + "reseted hidden state of previous time step.") + .AsIntermediate(); + AddOutput("Hidden", + "(Tensor) The GRU hidden state of the current time step " + "with shape [batch_size, frame_size]."); + AddAttr("activation", + "(enum int, default tanh) " + "The activation type used for output candidate {h}_t.") + .SetDefault(tanh) + .InEnum({identity, sigmoid, tanh, relu}); + AddAttr("gate_activation", + "(enum int, default sigmoid) " + "The activation type used in update gate and reset gate.") + .SetDefault(sigmoid) + .InEnum({identity, sigmoid, tanh, relu}); + AddComment(R"DOC( +GRUUnit Operator implements partial calculations of the GRU unit as following: + +$$ +update \ gate: u_t = actGate(xu_t + W_u * h_{t-1} + b_u) \\ +reset \ gate: r_t = actGate(xr_t + W_r * h_{t-1} + b_r) \\ +output \ candidate: {h}_t = actNode(xc_t + W_c * dot(r_t, h_{t-1}) + b_c) \\ +output: h_t = dot((1 - u_t), h_{t-1}) + dot(u_t, {h}_t) +$$ + +which is same as one time step of GRU Operator. + +@note To implement the complete GRU unit, fully-connected operator must be +used before to feed xu, xr and xc as the Input of GRUUnit operator. + +)DOC"); + } +}; + +class GRUUnitGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(%s) of GRUUnitGradOp should not be null.", "Input"); + PADDLE_ENFORCE(ctx->HasInput("HiddenPrev"), + "Input(%s) of GRUUnitGradOp should not be null.", + "HiddenPrev"); + PADDLE_ENFORCE(ctx->HasInput("Weight"), + "Input(%s) of GRUUnitGradOp should not be null.", "Weight"); + PADDLE_ENFORCE(ctx->HasInput("Gate"), + "Input(%s) of GRUUnitGradOp should not be null.", "Gate"); + PADDLE_ENFORCE(ctx->HasInput("ResetHiddenPrev"), + "Input(%s) of GRUUnitGradOp should not be null.", + "ResetHiddenPrev"); + PADDLE_ENFORCE(ctx->HasInput("Hidden"), + "Input(%s) of GRUUnitGradOp should not be null.", "Hidden"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Hidden")), + "Input(%s@GRAD) of GRUUnitGradOp should not be null.", + "Hidden"); + auto input_dims = ctx->GetInputDim("Input"); + auto hidden_prev_dims = ctx->GetInputDim("HiddenPrev"); + auto weight_dims = ctx->GetInputDim("Weight"); + // int batch_size = input_dims[0]; + int input_size = input_dims[1]; + int frame_size = hidden_prev_dims[1]; + int weight_height = weight_dims[0]; + int weight_width = weight_dims[1]; + PADDLE_ENFORCE_EQ( + input_size, frame_size * 3, + "The input_size must be 3 times of frame_size in GRUUnitOp."); + PADDLE_ENFORCE_EQ( + weight_height, frame_size, + "The shape of Weight matrix must be [frame_size, frame_size * 3]."); + PADDLE_ENFORCE_EQ( + weight_width, frame_size * 3, + "The shape of Weight matrix must be [frame_size, frame_size * 3]."); + if (ctx->HasInput("Bias")) { + auto bias_dims = 
ctx->GetInputDim("Bias"); + int bias_height = bias_dims[0]; + int bias_width = bias_dims[1]; + PADDLE_ENFORCE_EQ(bias_height, 1, + "The shape of Bias must be [1, frame_size * 3]."); + PADDLE_ENFORCE_EQ(bias_width, frame_size * 3, + "The shape of Bias must be [1, frame_size * 3]."); + auto bias_grad_name = framework::GradVarName("Bias"); + if (ctx->HasOutput(bias_grad_name)) + ctx->SetOutputDim(bias_grad_name, bias_dims); + } + auto input_grad_name = framework::GradVarName("Input"); + if (ctx->HasOutput(input_grad_name)) + ctx->SetOutputDim(input_grad_name, input_dims); + auto hidden_prev_grad_name = framework::GradVarName("HiddenPrev"); + if (ctx->HasOutput(hidden_prev_grad_name)) + ctx->SetOutputDim(hidden_prev_grad_name, hidden_prev_dims); + auto weight_grad_name = framework::GradVarName("Weight"); + if (ctx->HasOutput(weight_grad_name)) + ctx->SetOutputDim(weight_grad_name, weight_dims); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(gru_unit, ops::GRUUnitOp, ops::GRUUnitOpMaker, gru_unit_grad, + ops::GRUUnitGradOp); +REGISTER_OP_CPU_KERNEL( + gru_unit, ops::GRUUnitKernel, + ops::GRUUnitKernel); +REGISTER_OP_CPU_KERNEL( + gru_unit_grad, + ops::GRUUnitGradKernel, + ops::GRUUnitGradKernel); diff --git a/paddle/fluid/operators/gru_unit_op.cu b/paddle/fluid/operators/gru_unit_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..88b707fd1314ec5f12b507edc64a56ea9895a9d6 --- /dev/null +++ b/paddle/fluid/operators/gru_unit_op.cu @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/gru_unit_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + gru_unit, ops::GRUUnitKernel, + ops::GRUUnitKernel); +REGISTER_OP_CUDA_KERNEL( + gru_unit_grad, + ops::GRUUnitGradKernel, + ops::GRUUnitGradKernel); diff --git a/paddle/fluid/operators/gru_unit_op.h b/paddle/fluid/operators/gru_unit_op.h new file mode 100644 index 0000000000000000000000000000000000000000..c4031a5a575e59488c1a6cd77c3da88ea6af423e --- /dev/null +++ b/paddle/fluid/operators/gru_unit_op.h @@ -0,0 +1,244 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
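GRUUnitOp above encodes its activation and gate_activation attributes as small integers restricted by InEnum, and the kernel that follows dispatches on those codes. A rough standalone sketch of such a dispatch follows; the tanh enumerator is renamed tanh_act here only to avoid colliding with std::tanh in a free-standing file, and this is not the Paddle implementation.

#include <algorithm>
#include <cmath>
#include <iostream>
#include <stdexcept>

// Same integer coding as the attribute enum: identity = 0, sigmoid = 1, tanh = 2, relu = 3.
enum ActivationType { identity = 0, sigmoid = 1, tanh_act = 2, relu = 3 };

float Activate(int act_type, float x) {
  switch (act_type) {
    case identity: return x;
    case sigmoid:  return 1.0f / (1.0f + std::exp(-x));
    case tanh_act: return std::tanh(x);
    case relu:     return std::max(0.0f, x);
    default: throw std::invalid_argument("unsupported activation type");
  }
}

int main() {
  std::cout << Activate(sigmoid, 0.0f) << ' '  // 0.5
            << Activate(relu, -2.0f) << '\n';  // 0
  return 0;
}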
*/ + +#pragma once + +#include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/math/math_function.h" + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenMatrix = framework::EigenMatrix; + +template +using EigenVector = framework::EigenVector; + +enum GRUActivationType { identity = 0, sigmoid = 1, tanh = 2, relu = 3 }; + +template +class GRUUnitKernel : public framework::OpKernel { + public: + template + void ActCompute(const int act_type, const Device& d, X x, Y y) const { + if (act_type == identity) + y.device(d) = x; + else if (act_type == sigmoid) + SigmoidFunctor()(d, x, y); + else if (act_type == tanh) + TanhFunctor()(d, x, y); + else if (act_type == relu) + ReluFunctor()(d, x, y); + else + PADDLE_THROW("unsupported activation type"); + } + + void Compute(const framework::ExecutionContext& context) const override { + auto* input = context.Input("Input"); + auto* hidden_prev = context.Input("HiddenPrev"); + auto* weight = context.Input("Weight"); + auto* bias = context.Input("Bias"); + auto* gate = context.Output("Gate"); + gate->mutable_data(context.GetPlace()); + auto* reset_hidden_prev = context.Output("ResetHiddenPrev"); + reset_hidden_prev->mutable_data(context.GetPlace()); + auto* hidden = context.Output("Hidden"); + hidden->mutable_data(context.GetPlace()); + + int batch_size = input->dims()[0]; + int frame_size = hidden_prev->dims()[1]; + + auto x = EigenMatrix::From(*input); + auto h_p = EigenMatrix::From(*hidden_prev); + auto g = EigenMatrix::From(*gate); + auto r_h_p = EigenMatrix::From(*reset_hidden_prev); + auto h = EigenMatrix::From(*hidden); + auto& place = + *context.template device_context().eigen_device(); + + // calculate unactivated gate outputs + if (bias) { + auto b = EigenMatrix::From(*bias); + g.device(place) = x + + b.reshape(Eigen::array({{1, frame_size * 3}})) + .broadcast(Eigen::array({{batch_size, 1}})); + } else { + g.device(place) = x; + } + const T* hidden_prev_data = hidden_prev->data(); + const T* weight_data = weight->data(); + T* gate_data = gate->data(); + T* reset_hidden_prev_data = reset_hidden_prev->data(); + math::gemm( + context.template device_context(), false, false, + batch_size, 2 * frame_size, frame_size, 1, hidden_prev_data, frame_size, + weight_data, frame_size * 2, 1, gate_data, frame_size * 3); + + // calculate activited gate + Eigen::array extents({{batch_size, frame_size}}); + Eigen::array u_offsets({{0, 0}}); + ActCompute(context.Attr("gate_activation"), place, + g.slice(u_offsets, extents), g.slice(u_offsets, extents)); + auto u = g.slice(u_offsets, extents); // update gate + Eigen::array r_offsets({{0, frame_size}}); + ActCompute(context.Attr("gate_activation"), place, + g.slice(r_offsets, extents), g.slice(r_offsets, extents)); + auto r = g.slice(r_offsets, extents); // reset gate + r_h_p.device(place) = r * h_p; // reset previous hidden state + math::gemm( + context.template device_context(), false, false, + batch_size, frame_size, frame_size, 1, reset_hidden_prev_data, + frame_size, weight_data + frame_size * frame_size * 2, frame_size, 1, + gate_data + frame_size * 2, frame_size * 3); + + Eigen::array c_offsets({{0, frame_size * 2}}); + ActCompute(context.Attr("activation"), place, + g.slice(c_offsets, extents), g.slice(c_offsets, extents)); + auto c = g.slice(c_offsets, extents); // output candidate + + // calculate final output + h.device(place) = u * (c 
- h_p) + h_p; + } +}; + +template +class GRUUnitGradKernel : public framework::OpKernel { + public: + template + void ActGradCompute(const int act_type, const Device& d, X x, Y y, DX dx, + DY dy) const { + // x is dummy and won't be used even in Relu(use y instead) + if (act_type == identity) + dx.device(d) = dy; + else if (act_type == sigmoid) + SigmoidGradFunctor()(d, x, y, dy, dx); + else if (act_type == tanh) + TanhGradFunctor()(d, x, y, dy, dx); + else if (act_type == relu) + ReluGradFunctor()(d, x, y, dy, dx); + else + PADDLE_THROW("unsupported activation type"); + } + + void Compute(const framework::ExecutionContext& context) const override { + auto* input = context.Input("Input"); + auto* hidden_prev = context.Input("HiddenPrev"); + auto* weight = context.Input("Weight"); + auto* gate = context.Input("Gate"); + auto* reset_hidden_prev = context.Input("ResetHiddenPrev"); + auto* hidden_grad = context.Input(framework::GradVarName("Hidden")); + auto* input_grad = context.Output(framework::GradVarName("Input")); + auto* hidden_prev_grad = + context.Output(framework::GradVarName("HiddenPrev")); + auto* weight_grad = + context.Output(framework::GradVarName("Weight")); + auto* bias_grad = context.Output(framework::GradVarName("Bias")); + Tensor gate_grad; + Tensor reset_hidden_prev_grad; + + const T* hidden_prev_data = hidden_prev->data(); + const T* weight_data = weight->data(); + T* gate_grad_data = + gate_grad.mutable_data(input->dims(), context.GetPlace()); + const T* reset_hidden_prev_data = reset_hidden_prev->data(); + T* reset_hidden_prev_grad_data = reset_hidden_prev_grad.mutable_data( + reset_hidden_prev->dims(), context.GetPlace()); + + auto h_p = EigenMatrix::From(*hidden_prev); + auto g = EigenMatrix::From(*gate); + auto d_h = EigenMatrix::From(*hidden_grad); + auto d_g = EigenMatrix::From(gate_grad); + auto d_r_h_p = EigenMatrix::From(reset_hidden_prev_grad); + auto& place = + *context.template device_context().eigen_device(); + + int batch_size = input->dims()[0]; + int frame_size = hidden_prev->dims()[1]; + + Eigen::array extents({{batch_size, frame_size}}); + Eigen::array u_offsets({{0, 0}}); + auto u = g.slice(u_offsets, extents); // update gate + Eigen::array r_offsets({{0, frame_size}}); + auto r = g.slice(r_offsets, extents); // reset gate + Eigen::array c_offsets({{0, frame_size * 2}}); + auto c = g.slice(c_offsets, extents); // output candidate + + // backward for unactivated update gate + ActGradCompute(context.Attr("gate_activation"), place, u, u, + d_g.slice(u_offsets, extents), d_h * (c - h_p)); + // backward for unactivated output candidate + ActGradCompute(context.Attr("activation"), place, c, c, + d_g.slice(c_offsets, extents), d_h * u); + // backward for reset_hidden_prev + math::gemm( + context.template device_context(), false, true, + batch_size, frame_size, frame_size, 1, gate_grad_data + frame_size * 2, + frame_size * 3, weight_data + frame_size * frame_size * 2, frame_size, + 0, reset_hidden_prev_grad_data, frame_size); + // backward for unactivated reset gate + ActGradCompute(context.Attr("gate_activation"), place, r, r, + d_g.slice(r_offsets, extents), d_r_h_p * h_p); + // backward for weight + if (weight_grad) { + T* weight_grad_data = weight_grad->mutable_data(context.GetPlace()); + // backward for state_weight + math::gemm( + context.template device_context(), true, false, + frame_size, frame_size, batch_size, 1, reset_hidden_prev_data, + frame_size, gate_grad_data + frame_size * 2, frame_size * 3, 0, + weight_grad_data + frame_size * frame_size * 
2, frame_size); + + // backward for update_gate_weight and reset_gate_weight + math::gemm( + context.template device_context(), true, false, + frame_size, frame_size * 2, batch_size, 1, hidden_prev_data, + frame_size, gate_grad_data, frame_size * 3, 0, weight_grad_data, + frame_size * 2); + } + // backward for hidden_prev + if (hidden_prev_grad) { + T* hidden_prev_grad_data = + hidden_prev_grad->mutable_data(context.GetPlace()); + auto d_h_p = EigenMatrix::From(*hidden_prev_grad); + d_h_p.device(place) = d_r_h_p * r + d_h * (u.constant(T(1)) - u); + math::gemm( + context.template device_context(), false, true, + batch_size, frame_size, frame_size * 2, 1, gate_grad_data, + frame_size * 3, weight_data, frame_size * 2, 1, hidden_prev_grad_data, + frame_size); + } + // backward for input + if (input_grad) { + input_grad->mutable_data(context.GetPlace()); + auto d_x = EigenMatrix::From(*input_grad); + d_x.device(place) = d_g; + } + // backward for bias + if (bias_grad) { + bias_grad->mutable_data(context.GetPlace()); + auto d_b = EigenVector::Flatten(*bias_grad); + d_b.device(place) = d_g.sum(Eigen::array({{0}})); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/hinge_loss_op.cc b/paddle/fluid/operators/hinge_loss_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f644c22c9f1bdde6edc0126186361baccfbfcfb0 --- /dev/null +++ b/paddle/fluid/operators/hinge_loss_op.cc @@ -0,0 +1,113 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/hinge_loss_op.h" + +namespace paddle { +namespace operators { + +class HingeLossOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Logits"), + "Input(Logits) must be initialized."); + PADDLE_ENFORCE(ctx->HasInput("Labels"), + "Input(Labels) must be initialized."); + + auto pred_dims = ctx->GetInputDim("Logits"); + auto label_dims = ctx->GetInputDim("Labels"); + + PADDLE_ENFORCE_EQ(pred_dims, label_dims); + PADDLE_ENFORCE_EQ(pred_dims.size(), 2, + "The rank of Input(Logits) must be 2 and the shape is " + "[batch_size, 1]."); + PADDLE_ENFORCE_EQ(pred_dims[1], 1, + "Each row of Input(Logits) contains a real value, " + "so the 2nd dimension of Input(Logits) must be 1."); + + ctx->SetOutputDim("Loss", {pred_dims[0], 1}); + ctx->ShareLoD("Logits", "Loss"); + } +}; + +template +class HingeLossOpMaker : public framework::OpProtoAndCheckerMaker { + public: + HingeLossOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Logits", + "The input value (Logits) of Hinge loss op." + "Logits is a 2-D tensor with shape [batch_size, 1]."); + AddInput("Labels", + "The target value (Labels) of Hinge loss op." 
+ "Labels is a 2-D tensor with shape [batch_size, 1]."); + AddOutput("Loss", + "The output tensor with shape [batch_size, 1] " + "which represents the hinge loss."); + AddComment(R"DOC( +HingeLoss Operator. + +Let x be a logit (prediction) and y be the actual label. The logit can +take any values from (-inf, inf), but the labels should be either -1 or 1. +Then, the hinge loss is computed as follows: + +$$ +L_(x, y) = max(1 - y.x, 0) +$$ + +Note that the labels passed as input will have values as either 0 or 1. + +)DOC"); + } +}; + +class HingeLossGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Logits"), + "Input(Logits) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Labels"), + "Input(Labels) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")), + "Input(Loss@GRAD) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Logits")), + "Input(Logits@GRAD) should not be null."); + + auto pred_dims = ctx->GetInputDim("Logits"); + auto lab_dims = ctx->GetInputDim("Labels"); + auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss")); + + PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims); + + auto pred_grad_name = framework::GradVarName("Logits"); + ctx->SetOutputDim(pred_grad_name, pred_dims); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(hinge_loss, ops::HingeLossOp, ops::HingeLossOpMaker, + hinge_loss_grad, ops::HingeLossGradOp); +REGISTER_OP_CPU_KERNEL( + hinge_loss, + ops::HingeLossKernel); +REGISTER_OP_CPU_KERNEL( + hinge_loss_grad, + ops::HingeLossGradKernel); diff --git a/paddle/fluid/operators/hinge_loss_op.cu b/paddle/fluid/operators/hinge_loss_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..cb53a9b7f4aaeeee71ed81c507feb0e9c946a541 --- /dev/null +++ b/paddle/fluid/operators/hinge_loss_op.cu @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/hinge_loss_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + hinge_loss, + ops::HingeLossKernel); +REGISTER_OP_CUDA_KERNEL( + hinge_loss_grad, + ops::HingeLossGradKernel); diff --git a/paddle/fluid/operators/hinge_loss_op.h b/paddle/fluid/operators/hinge_loss_op.h new file mode 100644 index 0000000000000000000000000000000000000000..1e924d236ea1d3208a8f425f76be8d455714a51f --- /dev/null +++ b/paddle/fluid/operators/hinge_loss_op.h @@ -0,0 +1,71 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class HingeLossKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* pred = context.Input("Logits"); + auto* label = context.Input("Labels"); + auto* loss = context.Output("Loss"); + auto& place = + *context.template device_context().eigen_device(); + + auto x = framework::EigenVector::Flatten(*pred); + auto y = framework::EigenVector::Flatten(*label); + loss->mutable_data(context.GetPlace()); + auto l = framework::EigenVector::Flatten(*loss); + l.device(place) = + (static_cast(1) - x * (static_cast(2) * y - static_cast(1))) + .cwiseMax(static_cast(0)); + } +}; + +template +class HingeLossGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* pred = context.Input("Logits"); + auto* label = context.Input("Labels"); + auto* dloss = + context.Input(framework::GradVarName("Loss")); + auto* dpred = + context.Output(framework::GradVarName("Logits")); + auto& place = + *context.template device_context().eigen_device(); + + auto x = framework::EigenVector::Flatten(*pred); + auto y = framework::EigenVector::Flatten(*label); + auto dl = framework::EigenVector::Flatten(*dloss); + + if (dpred) { + dpred->mutable_data(context.GetPlace()); + auto dx = framework::EigenVector::Flatten(*dpred); + auto alt_labels = static_cast(2) * y - static_cast(1); + dx.device(place) = + dl * ((x * alt_labels) < static_cast(1)).template cast() * + (-alt_labels); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/huber_loss_op.cc b/paddle/fluid/operators/huber_loss_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..dc1f609dcfa23dff82812e72c16b1d62a93ca9a6 --- /dev/null +++ b/paddle/fluid/operators/huber_loss_op.cc @@ -0,0 +1,131 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
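The backward pass of HingeLossGradKernel above is a simple subgradient: the upstream gradient scaled by -alt_label when the example is inside the margin, zero otherwise. A scalar sketch of that expression (hypothetical helper name, not part of the operator):

#include <cstdio>

// Subgradient of max(1 - x*alt, 0) w.r.t. the logit x, scaled by the upstream
// gradient dl, matching the Eigen expression in HingeLossGradKernel.
static float hinge_loss_grad(float logit, float label01, float dl) {
  float alt = 2.0f * label01 - 1.0f;
  return (logit * alt < 1.0f) ? dl * (-alt) : 0.0f;
}

int main() {
  std::printf("%f\n", hinge_loss_grad(0.3f, 1.0f, 1.0f));   // -1.0: inside margin for label 1, push logit up
  std::printf("%f\n", hinge_loss_grad(-0.2f, 0.0f, 1.0f));  // +1.0: inside margin for label 0, push logit down
  return 0;
}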
*/ + +#include "paddle/fluid/operators/huber_loss_op.h" + +namespace paddle { +namespace operators { + +class HuberLossOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must be initialized."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) must be initialized."); + + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + + PADDLE_ENFORCE_EQ(x_dims, y_dims); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, + "The rank of Input(X) must be 2 and the shape is " + "[batch_size, 1]."); + PADDLE_ENFORCE_EQ(x_dims[1], 1, + "Each row of Input(X) contains a real value, " + "so the 2nd dimension of Input(X) must be 1."); + + ctx->SetOutputDim("Residual", x_dims); + ctx->SetOutputDim("Out", {x_dims[0], 1}); + ctx->ShareLoD("X", "Out"); + } +}; + +template +class HuberLossOpMaker : public framework::OpProtoAndCheckerMaker { + public: + HuberLossOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "The input value of huber loss op." + "X is a 2-D tensor with shape [batch_size, 1]."); + AddInput("Y", + "The target value of huber loss op." + "Y is a 2-D tensor with shape [batch_size, 1]."); + AddOutput("Residual", + "Intermediate tensor to cache residual value between Y and X." + "The shape is same as Input(X) and will be reused in backward.") + .AsIntermediate(); + AddOutput("Out", + "The output tensor with shape [batch_size, 1] " + "which represents the huber loss."); + AddAttr("delta", "Hyper parameter in huber loss."); + AddComment(R"DOC( +HuberLoss Operator. + +Huber loss is a loss function used in robust regression. We define X as the +input value and Y as the target value. Huber loss can evaluate the fitness of +X to Y. Different from MSE loss, Huber loss is more robust for outliers. The +shape of X and Y are [batch_size, 1]. The equation is: + +$$ +Out_{\delta}(X, Y)_i = +\begin{cases} +0.5 * (Y_i - X_i)^2, +\quad |Y_i - X_i| \leq \delta \\ +\delta * (|Y_i - X_i| - 0.5 * \delta), +\quad otherwise +\end{cases} +$$ + +In the above equation, $Out_\delta(X, Y)_i$, $X_i$ and $Y_i$ represent the ith +element of Out, X and Y. 
+ +)DOC"); + } +}; + +class HuberLossGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Residual"), + "Input(Residual) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + auto residual_dims = ctx->GetInputDim("Residual"); + auto out_grad_dims = ctx->GetInputDim(framework::GradVarName("Out")); + + PADDLE_ENFORCE_EQ(residual_dims, x_dims); + PADDLE_ENFORCE_EQ(out_grad_dims, x_dims); + + auto x_grad_name = framework::GradVarName("X"); + auto y_grad_name = framework::GradVarName("Y"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + if (ctx->HasOutput(y_grad_name)) { + ctx->SetOutputDim(y_grad_name, y_dims); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker, + huber_loss_grad, ops::HuberLossGradOp); +REGISTER_OP_CPU_KERNEL( + huber_loss, + ops::HuberLossKernel); +REGISTER_OP_CPU_KERNEL( + huber_loss_grad, + ops::HuberLossGradKernel); diff --git a/paddle/fluid/operators/huber_loss_op.cu b/paddle/fluid/operators/huber_loss_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..ef5120c69d4fda533625ace9bab504be39385ec9 --- /dev/null +++ b/paddle/fluid/operators/huber_loss_op.cu @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/huber_loss_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + huber_loss, + ops::HuberLossKernel); +REGISTER_OP_CUDA_KERNEL( + huber_loss_grad, + ops::HuberLossGradKernel); diff --git a/paddle/fluid/operators/huber_loss_op.h b/paddle/fluid/operators/huber_loss_op.h new file mode 100644 index 0000000000000000000000000000000000000000..caca89fcf63d27c7717de522e38f6ff0cab0d8f6 --- /dev/null +++ b/paddle/fluid/operators/huber_loss_op.h @@ -0,0 +1,121 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; + +template +struct HuberLossForward { + HOSTDEVICE HuberLossForward(const T& delta) : delta(delta) {} + + HOSTDEVICE T operator()(const T& val) const { + T abs_val = std::abs(val); + if (abs_val <= delta) { + return static_cast(0.5) * val * val; + } else { + return delta * (abs_val - static_cast(0.5) * delta); + } + } + + T delta; +}; + +template +class HuberLossKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in0 = context.Input("X"); + auto* in1 = context.Input("Y"); + auto* out0 = context.Output("Residual"); + auto* out1 = context.Output("Out"); + auto delta = static_cast(context.Attr("delta")); + auto& place = + *context.template device_context().eigen_device(); + + auto x = EigenVector::Flatten(*in0); + auto y = EigenVector::Flatten(*in1); + out0->mutable_data(context.GetPlace()); + auto residual = EigenVector::Flatten(*out0); + residual.device(place) = y - x; + out1->mutable_data(context.GetPlace()); + auto loss = EigenVector::Flatten(*out1); + loss.device(place) = residual.unaryExpr(HuberLossForward(delta)); + } +}; + +template +struct HuberLossBackward { + HOSTDEVICE HuberLossBackward(const T& delta, T sign) + : sign(sign), delta(delta) {} + + HOSTDEVICE T operator()(const T& val) const { + T abs_val = std::abs(val); + if (abs_val <= delta) { + return sign * val; + } else { + if (val > 0) { + return sign * delta; + } else { + return -1 * sign * delta; + } + } + } + + T sign; + T delta; +}; + +template +class HuberLossGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in0 = context.Input("Residual"); + auto* in1 = context.Input(framework::GradVarName("Out")); + auto* out0 = context.Output(framework::GradVarName("X")); + auto* out1 = context.Output(framework::GradVarName("Y")); + auto delta = static_cast(context.op().Attr("delta")); + auto& place = + *context.template device_context().eigen_device(); + + auto residual = EigenVector::Flatten(*in0); + auto out_grad = EigenVector::Flatten(*in1); + + if (out0) { + out0->mutable_data(context.GetPlace()); + auto x_grad = EigenVector::Flatten(*out0); + x_grad.device(place) = + out_grad * residual.unaryExpr(HuberLossBackward(delta, -1.0)); + } + + if (out1) { + out1->mutable_data(context.GetPlace()); + auto y_grad = EigenVector::Flatten(*out1); + y_grad.device(place) = + out_grad * residual.unaryExpr(HuberLossBackward(delta, 1.0)); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/im2sequence_op.cc b/paddle/fluid/operators/im2sequence_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..936e5fe49eda40dff6d8aa5fd626d443ee8dbe75 --- /dev/null +++ b/paddle/fluid/operators/im2sequence_op.cc @@ -0,0 +1,157 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
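The sign arguments passed to HuberLossBackward above follow from the chain rule through residual = y - x (d residual / dx = -1, d residual / dy = +1). A scalar sketch of both gradients under the same piecewise definition (illustration only, not the functor itself):

#include <cmath>
#include <cstdio>

// dLoss/dresidual for the Huber loss: r in the quadratic region, +-delta outside.
static double dhuber_dr(double r, double delta) {
  return std::fabs(r) <= delta ? r : (r > 0 ? delta : -delta);
}

int main() {
  const double delta = 1.0, dout = 1.0;
  double x = 2.0, y = 2.4;                   // residual = 0.4, quadratic region
  double r = y - x;
  double dx = dout * -dhuber_dr(r, delta);   // sign = -1, as in x_grad above
  double dy = dout * +dhuber_dr(r, delta);   // sign = +1, as in y_grad above
  std::printf("dx=%f dy=%f\n", dx, dy);      // dx=-0.4 dy=0.4
  return 0;
}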
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/im2sequence_op.h" + +namespace paddle { +namespace operators { + +class Im2SequenceOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of Im2SequenceOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of Im2SequenceOp op should not be null."); + + auto in_dim = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(in_dim.size(), 4, + "Input(X) format must be 4D tensor, eg., NCHW."); + + auto kernels = ctx->Attrs().Get>("kernels"); + auto strides = ctx->Attrs().Get>("strides"); + auto paddings = ctx->Attrs().Get>("paddings"); + + int batch_size = in_dim[0]; + int img_channels = in_dim[1]; + int img_height = in_dim[2]; + int img_width = in_dim[3]; + + int output_height = OutputSize(img_height, kernels[0], paddings[0], + paddings[2], strides[0]); + int output_width = + OutputSize(img_width, kernels[1], paddings[1], paddings[3], strides[1]); + + ctx->SetOutputDim("Out", {batch_size * output_height * output_width, + img_channels * kernels[0] * kernels[1]}); + } +}; + +class Im2SequenceOpMaker : public framework::OpProtoAndCheckerMaker { + public: + Im2SequenceOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(Tensor) The input tensor has NCHW format." + "N: batch size" + "C: channels" + "H: height" + "W: width"); + AddOutput("Out", "(LodTensor) The output data of im2sequence op,"); + AddAttr>("kernels", + "(vector), the " + "kernels(kernel_height, kernel_width)"); + AddAttr>("strides", + "(vector default:{1, 1}), the " + "strides(h_stride, w_stride)") + .SetDefault({1, 1}); + AddAttr>("paddings", + "(vector default:{0, 0, 0, 0}), the " + "paddings(up_pad, left_pad, down_pad, right_pad)") + .SetDefault({0, 0, 0, 0}); + AddComment(R"DOC( +This op uses kernels to scan images and converts these images to sequences. +After expanding, The number of time steps are output_height * output_width +and the dimension of each time step is kernel_height * kernel_width * channels, +in which: + +output_height = + 1 + (padding_height + padding_down + img_height - kernel_height + stride_height - 1) / + stride_height; +output_width = + 1 + (padding_left + padding+right + img_width - kernel_width + stride_width - 1) / + stride_width; + +This op can be used after convolution neural network, and before recurrent neural network. + +Given: + +x = [[[[ 6. 2. 1.] + [ 8. 3. 5.] + [ 0. 2. 6.]] + + [[ 2. 4. 4.] + [ 6. 3. 0.] + [ 6. 4. 7.]]] + + [[[ 6. 7. 1.] + [ 5. 7. 9.] + [ 2. 4. 8.]] + + [[ 1. 2. 1.] + [ 1. 3. 5.] + [ 9. 0. 8.]]]] +x.dims = {2, 2, 3, 3} + +And: + +kernels = [2, 2] +strides = [1, 1] +paddings = [0, 0, 0, 0] + +Then: + +output.data = [[ 6. 2. 8. 3. 2. 4. 6. 3.] + [ 2. 1. 3. 5. 4. 4. 3. 0.] + [ 8. 3. 0. 2. 6. 3. 6. 4.] + [ 3. 5. 2. 6. 3. 0. 4. 7.] + [ 6. 7. 5. 7. 1. 2. 1. 3.] + [ 7. 1. 7. 9. 2. 1. 3. 5.] + [ 5. 7. 2. 4. 1. 3. 9. 0.] + [ 7. 9. 4. 8. 3. 5. 0. 
8.]] +output.dims = {8, 9} +output.lod = [[0, 4, 8]] + +)DOC"); + } +}; + +class Im2SequenceGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) shouldn't be null."); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(im2sequence, ops::Im2SequenceOp, ops::Im2SequenceOpMaker, + im2sequence_grad, ops::Im2SequenceGradOp); +REGISTER_OP_CPU_KERNEL( + im2sequence, + ops::Im2SequenceKernel); +REGISTER_OP_CPU_KERNEL( + im2sequence_grad, + ops::Im2SequenceGradKernel); diff --git a/paddle/fluid/operators/im2sequence_op.cu b/paddle/fluid/operators/im2sequence_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..1e7bf4631224620ad5c65f750ed0c0c22e936dcf --- /dev/null +++ b/paddle/fluid/operators/im2sequence_op.cu @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/im2sequence_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + im2sequence, + ops::Im2SequenceKernel); +REGISTER_OP_CUDA_KERNEL( + im2sequence_grad, + ops::Im2SequenceGradKernel); diff --git a/paddle/fluid/operators/im2sequence_op.h b/paddle/fluid/operators/im2sequence_op.h new file mode 100644 index 0000000000000000000000000000000000000000..59456f0ea2996bb20bd48806a7258e31518a5ea3 --- /dev/null +++ b/paddle/fluid/operators/im2sequence_op.h @@ -0,0 +1,135 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + You may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
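Plugging the DOC example (3x3 feature maps, 2x2 kernels, stride 1, no padding) into the output-size formula gives 2x2 sliding positions per image, i.e. 4 time steps per sample (lod [0, 4, 8]) and channels * kH * kW = 8 values per time step, consistent with the rows of output.data shown above. A standalone check of that arithmetic, mirroring the OutputSize helper defined in im2sequence_op.h:

#include <cstdio>

// Same arithmetic as the inline OutputSize helper in im2sequence_op.h.
static int OutputSize(int input, int filter, int pad_0, int pad_1, int stride) {
  return (input + pad_0 + pad_1 - filter) / stride + 1;
}

int main() {
  int oh = OutputSize(3, 2, 0, 0, 1);  // 2 sliding positions vertically
  int ow = OutputSize(3, 2, 0, 0, 1);  // 2 sliding positions horizontally
  int batch = 2, channels = 2, kh = 2, kw = 2;
  std::printf("rows=%d cols=%d\n", batch * oh * ow, channels * kh * kw);  // rows=8 cols=8
  return 0;
}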
*/ + +#pragma once + +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/im2col.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +inline int OutputSize(int input_size, int filter_size, int padding_0, + int padding_1, int stride) { + const int output_size = + (input_size + padding_0 + padding_1 - filter_size) / stride + 1; + return output_size; +} + +template +class Im2SequenceKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const Tensor* in = ctx.Input("X"); + LoDTensor* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + // TODO(wanghaoshuang): Add layout checker after 'set_layout' + // being available for python API + // PADDLE_ENFORCE_EQ(in->layout(), framework::DataLayout::kNCHW, + // "Input(X) layout must be NCHW"); + auto in_dim = in->dims(); + int batch_size = in_dim[0]; + int img_channels = in_dim[1]; + int img_height = in_dim[2]; + int img_width = in_dim[3]; + + auto kernels = ctx.Attr>("kernels"); + auto strides = ctx.Attr>("strides"); + auto paddings = ctx.Attr>("paddings"); + int output_height = OutputSize(img_height, kernels[0], paddings[0], + paddings[2], strides[0]); + int output_width = + OutputSize(img_width, kernels[1], paddings[1], paddings[3], strides[1]); + + const std::vector dilations({1, 1}); + + auto out_dims = out->dims(); + out->Resize({batch_size, out->numel() / batch_size}); + for (int i = 0; i < batch_size; i++) { + const Tensor src = + in->Slice(i, i + 1).Resize({img_channels, img_height, img_width}); + Tensor dst = out->Slice(i, i + 1).Resize( + {output_height, output_width, img_channels, kernels[0], kernels[1]}); + + math::Im2ColFunctor f; + auto& dev_ctx = ctx.template device_context(); + f(dev_ctx, src, dilations, strides, paddings, &dst); + } + out->Resize(out_dims); + + // set lod information + // TODO(wanghaoshuang): Move this to InferShape + framework::LoD lod(1); + lod[0].reserve(batch_size + 1); + for (int i = 0, offset = 0; i < batch_size + 1; ++i) { + lod[0].push_back(offset); + offset += output_height * output_width; + } + out->set_lod(lod); + } +}; + +template +class Im2SequenceGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + Tensor* d_out = + const_cast(ctx.Input(framework::GradVarName("Out"))); + auto* d_x = ctx.Output(framework::GradVarName("X")); + d_x->mutable_data(ctx.GetPlace()); + + auto x_v = framework::EigenVector::Flatten(*d_x); + auto& place = *ctx.template device_context().eigen_device(); + x_v.device(place) = x_v.constant(0.0); + + auto in_dim = in->dims(); + int batch_size = in_dim[0]; + int img_channels = in_dim[1]; + int img_height = in_dim[2]; + int img_width = in_dim[3]; + + auto kernels = ctx.Attr>("kernels"); + auto strides = ctx.Attr>("strides"); + auto paddings = ctx.Attr>("paddings"); + int output_height = OutputSize(img_height, kernels[0], paddings[0], + paddings[2], strides[0]); + int output_width = + OutputSize(img_width, kernels[1], paddings[1], paddings[3], strides[1]); + + const std::vector dilations({1, 1}); + + auto d_out_dims = d_out->dims(); + d_out->Resize({batch_size, d_out->numel() / batch_size}); + for (int i = 0; i < batch_size; i++) { + Tensor dst = + 
d_x->Slice(i, i + 1).Resize({img_channels, img_height, img_width}); + const Tensor src = d_out->Slice(i, i + 1).Resize( + {output_height, output_width, img_channels, kernels[0], kernels[1]}); + math::Col2ImFunctor f; + auto& dev_ctx = ctx.template device_context(); + f(dev_ctx, src, dilations, strides, paddings, &dst); + } + d_out->Resize(d_out_dims); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/images/batch_norm_fork.dot b/paddle/fluid/operators/images/batch_norm_fork.dot similarity index 100% rename from paddle/operators/images/batch_norm_fork.dot rename to paddle/fluid/operators/images/batch_norm_fork.dot diff --git a/paddle/operators/images/batch_norm_fork.png b/paddle/fluid/operators/images/batch_norm_fork.png similarity index 100% rename from paddle/operators/images/batch_norm_fork.png rename to paddle/fluid/operators/images/batch_norm_fork.png diff --git a/paddle/operators/images/batch_norm_op_kernel.png b/paddle/fluid/operators/images/batch_norm_op_kernel.png similarity index 100% rename from paddle/operators/images/batch_norm_op_kernel.png rename to paddle/fluid/operators/images/batch_norm_op_kernel.png diff --git a/paddle/fluid/operators/increment_op.cc b/paddle/fluid/operators/increment_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3d488067b254c37515c6bdb9a4589aad311f344f --- /dev/null +++ b/paddle/fluid/operators/increment_op.cc @@ -0,0 +1,111 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class IncrementInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of IncrementOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of IncrementOp should not be null."); + PADDLE_ENFORCE_EQ(1, framework::product(ctx->GetInputDim("X"))); + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + } +}; + +struct IncrementFunctor { + IncrementFunctor(const framework::LoDTensor &x, framework::LoDTensor *out, + float value) + : x_(x), out_(out), value_(value) {} + + template + void operator()() const { + *out_->data() = *x_.data() + static_cast(value_); + } + + const framework::LoDTensor &x_; + framework::LoDTensor *out_; + float value_; +}; + +class IncrementOp : public framework::OperatorBase { + public: + IncrementOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto &x = scope.FindVar(Input("X"))->Get(); + auto &out = + *scope.FindVar(Output("Out"))->GetMutable(); + + PADDLE_ENFORCE(platform::is_cpu_place(x.place())); + out.Resize(x.dims()); + out.mutable_data(x.place(), x.type()); + float value = Attr("step"); + VLOG(10) << Output("Out") << " increase " << Input("X") << " with " + << value; + framework::VisitDataType(framework::ToDataType(out.type()), + IncrementFunctor(x, &out, value)); + } +}; + +class IncrementOpMaker : public framework::OpProtoAndCheckerMaker { + public: + IncrementOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor) The input tensor of increment operator"); + AddOutput("Out", "(Tensor) The output tensor of increment operator."); + AddAttr("step", + "(float, default 1.0) " + "The step size by which the " + "input tensor will be incremented.") + .SetDefault(1.0); + AddComment(R"DOC( +Increment Operator. + +The equation is: +$$Out = X + step$$ + +)DOC"); + } +}; + +class IncrementGradOpMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("increment"); + grad_op->SetInput("X", Output("Out")); + grad_op->SetOutput("Out", Input("X")); + grad_op->SetAttr("step", -boost::get(GetAttr("step"))); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(increment, ops::IncrementOp, ops::IncrementInferShape, + ops::IncrementOpMaker, ops::IncrementGradOpMaker); diff --git a/paddle/fluid/operators/iou_similarity_op.cc b/paddle/fluid/operators/iou_similarity_op.cc new file mode 100755 index 0000000000000000000000000000000000000000..c2e452cdfaa71cae53c2bfe259bae7f80cd259d7 --- /dev/null +++ b/paddle/fluid/operators/iou_similarity_op.cc @@ -0,0 +1,96 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
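A trivial standalone analogue of what IncrementFunctor computes on its single-element input, including the sign flip that IncrementGradOpMaker applies to step (plain scalars, for illustration only):

#include <cstdio>

int main() {
  float step = 1.0f;
  float x = 3.0f;
  float out = x + step;           // forward: Out = X + step
  float x_back = out + (-step);   // the generated "grad" op increments again with -step
  std::printf("out=%f x_back=%f\n", out, x_back);  // out=4.0 x_back=3.0
  return 0;
}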
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/iou_similarity_op.h" + +namespace paddle { +namespace operators { + +class IOUSimilarityOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of IOUSimilarityOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), + "Input(Y) of IOUSimilarityOp should not be null."); + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + + PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "The rank of Input(X) must be 2."); + PADDLE_ENFORCE_EQ(x_dims[1], 4UL, "The shape of X is [N, 4]"); + PADDLE_ENFORCE_EQ(y_dims.size(), 2UL, "The rank of Input(Y) must be 2."); + PADDLE_ENFORCE_EQ(y_dims[1], 4UL, "The shape of Y is [M, 4]"); + + ctx->ShareLoD("X", /*->*/ "Out"); + ctx->SetOutputDim("Out", framework::make_ddim({x_dims[0], y_dims[0]})); + } +}; + +class IOUSimilarityOpMaker : public framework::OpProtoAndCheckerMaker { + public: + IOUSimilarityOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(LoDTensor, default LoDTensor) " + "Box list X is a 2-D LoDTensor with shape [N, 4] holds N boxes, " + "each box is represented as [xmin, ymin, xmax, ymax], " + "the shape of X is [N, 4]. [xmin, ymin] is the left top " + "coordinate of the box if the input is image feature map, they " + "are close to the origin of the coordinate system. " + "[xmax, ymax] is the right bottom coordinate of the box. " + "This tensor can contain LoD information to represent a batch " + "of inputs. One instance of this batch can contain different " + "numbers of entities."); + AddInput("Y", + "(Tensor, default Tensor) " + "Box list Y holds M boxes, each box is represented as " + "[xmin, ymin, xmax, ymax], the shape of X is [N, 4]. " + "[xmin, ymin] is the left top coordinate of the box if the " + "input is image feature map, and [xmax, ymax] is the right " + "bottom coordinate of the box."); + + AddOutput("Out", + "(LoDTensor, the lod is same as input X) The output of " + "iou_similarity op, a tensor with shape [N, M] " + "representing pairwise iou scores."); + + AddComment(R"DOC( +IOU Similarity Operator. +Computes intersection-over-union (IOU) between two box lists. + Box list 'X' should be a LoDTensor and 'Y' is a common Tensor, + boxes in 'Y' are shared by all instance of the batched inputs of X. 
+ Given two boxes A and B, the calculation of IOU is as follows: + +$$ +IOU(A, B) = +\frac{area(A\cap B)}{area(A)+area(B)-area(A\cap B)} +$$ + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(iou_similarity, ops::IOUSimilarityOp, + ops::IOUSimilarityOpMaker); + +REGISTER_OP_CPU_KERNEL( + iou_similarity, + ops::IOUSimilarityKernel, + ops::IOUSimilarityKernel); diff --git a/paddle/fluid/operators/iou_similarity_op.cu b/paddle/fluid/operators/iou_similarity_op.cu new file mode 100755 index 0000000000000000000000000000000000000000..f8df1f4aa4c4894b59fe373a9e0cb697dfb96b62 --- /dev/null +++ b/paddle/fluid/operators/iou_similarity_op.cu @@ -0,0 +1,21 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/iou_similarity_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + iou_similarity, + ops::IOUSimilarityKernel, + ops::IOUSimilarityKernel); diff --git a/paddle/fluid/operators/iou_similarity_op.h b/paddle/fluid/operators/iou_similarity_op.h new file mode 100644 index 0000000000000000000000000000000000000000..2fb1b5f70703f9c88a532644d88a8b5df45404f0 --- /dev/null +++ b/paddle/fluid/operators/iou_similarity_op.h @@ -0,0 +1,90 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/for_range.h" + +template +inline HOSTDEVICE T IOUSimilarity(T xmin1, T ymin1, T xmax1, T ymax1, T xmin2, + T ymin2, T xmax2, T ymax2) { + constexpr T zero = static_cast(0); + T area1 = (ymax1 - ymin1) * (xmax1 - xmin1); + T area2 = (ymax2 - ymin2) * (xmax2 - xmin2); + T inter_xmax = xmax1 > xmax2 ? xmax2 : xmax1; + T inter_ymax = ymax1 > ymax2 ? ymax2 : ymax1; + T inter_xmin = xmin1 > xmin2 ? xmin1 : xmin2; + T inter_ymin = ymin1 > ymin2 ? ymin1 : ymin2; + T inter_height = inter_ymax - inter_ymin; + T inter_width = inter_xmax - inter_xmin; + inter_height = inter_height > zero ? inter_height : zero; + inter_width = inter_width > zero ? 
inter_width : zero; + T inter_area = inter_width * inter_height; + T union_area = area1 + area2 - inter_area; + T sim_score = inter_area / union_area; + return sim_score; +} + +template +struct IOUSimilarityFunctor { + IOUSimilarityFunctor(const T* x, const T* y, T* z, int cols) + : x_(x), y_(y), z_(z), cols_(static_cast(cols)) {} + + inline HOSTDEVICE void operator()(size_t row_id) const { + T x_min1 = x_[row_id * 4]; + T y_min1 = x_[row_id * 4 + 1]; + T x_max1 = x_[row_id * 4 + 2]; + T y_max1 = x_[row_id * 4 + 3]; + for (size_t i = 0; i < cols_; ++i) { + T x_min2 = y_[i * 4]; + T y_min2 = y_[i * 4 + 1]; + T x_max2 = y_[i * 4 + 2]; + T y_max2 = y_[i * 4 + 3]; + + T sim = IOUSimilarity(x_min1, y_min1, x_max1, y_max1, x_min2, y_min2, + x_max2, y_max2); + + z_[row_id * cols_ + i] = sim; + } + } + const T* x_; + const T* y_; + T* z_; + const size_t cols_; +}; + +namespace paddle { +namespace operators { + +template +class IOUSimilarityKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const framework::LoDTensor* in_x = ctx.Input("X"); + const framework::Tensor* in_y = ctx.Input("Y"); + framework::LoDTensor* out = ctx.Output("Out"); + + int x_n = in_x->dims()[0]; + int y_n = in_y->dims()[0]; + IOUSimilarityFunctor functor(in_x->data(), in_y->data(), + out->mutable_data(ctx.GetPlace()), y_n); + + platform::ForRange for_range( + static_cast(ctx.device_context()), x_n); + for_range(functor); + } +}; // namespace operators + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/is_empty_op.cc b/paddle/fluid/operators/is_empty_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..ea424018d66dac85d5a4ad75cbf5199064d52848 --- /dev/null +++ b/paddle/fluid/operators/is_empty_op.cc @@ -0,0 +1,66 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
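A quick numeric check of the intersection-over-union arithmetic in the IOUSimilarity helper above, using two boxes that overlap in a 1x1 region (sketch only; coordinates are arbitrary):

#include <algorithm>
#include <cstdio>

// Same IOU arithmetic as the IOUSimilarity helper above.
static double iou(double xmin1, double ymin1, double xmax1, double ymax1,
                  double xmin2, double ymin2, double xmax2, double ymax2) {
  double area1 = (ymax1 - ymin1) * (xmax1 - xmin1);
  double area2 = (ymax2 - ymin2) * (xmax2 - xmin2);
  double iw = std::min(xmax1, xmax2) - std::max(xmin1, xmin2);
  double ih = std::min(ymax1, ymax2) - std::max(ymin1, ymin2);
  iw = iw > 0 ? iw : 0;
  ih = ih > 0 ? ih : 0;
  double inter = iw * ih;
  return inter / (area1 + area2 - inter);
}

int main() {
  // Two 2x2 boxes offset by 1 in each direction: intersection 1, union 7.
  std::printf("%f\n", iou(0, 0, 2, 2, 1, 1, 3, 3));  // ~0.142857
  return 0;
}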
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { + +constexpr char kInput[] = "X"; +constexpr char kOutput[] = "Out"; + +class IsEmptyOp : public framework::OperatorBase { + public: + IsEmptyOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + // get input + auto *var = scope.FindVar(Input(kInput)); + PADDLE_ENFORCE_NOT_NULL(var); + auto &tensor = var->Get(); + // get output + auto *out = scope.FindVar(Output(kOutput)); + PADDLE_ENFORCE_NOT_NULL(out); + auto *out_tensor = out->GetMutable(); + + out_tensor->Resize({1}); + out_tensor->mutable_data(platform::CPUPlace())[0] = + framework::product(tensor.dims()) == 0; + } +}; + +class IsEmptyOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + IsEmptyOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput(kInput, "(Tensor) Tensor which is to be checked."); + AddOutput(kOutput, "(Tensor) a boolean Tensor that indicate empty or not."); + AddComment(R"DOC( +IsEmpty Operator which checks whether a tensor is empty. + +It will just return product(tensor.ddims()) > 0; + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_WITHOUT_GRADIENT(is_empty, paddle::operators::IsEmptyOp, + paddle::operators::IsEmptyOpProtoMaker); diff --git a/paddle/fluid/operators/l1_norm_op.cc b/paddle/fluid/operators/l1_norm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..974ee404f8364ed66e9e213f857ea89993e1d6af --- /dev/null +++ b/paddle/fluid/operators/l1_norm_op.cc @@ -0,0 +1,76 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
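The Run() method above writes product(dims) == 0 into a single boolean output, i.e. a tensor counts as empty when any extent is zero. A plain sketch of that predicate (assumed std::vector in place of the framework's ddim type):

#include <cstdio>
#include <vector>

// Same predicate the operator computes: empty <=> product of dimensions is zero.
static bool is_empty(const std::vector<long>& dims) {
  long prod = 1;
  for (long d : dims) prod *= d;
  return prod == 0;
}

int main() {
  std::printf("%d\n", is_empty({0, 3}));  // 1: a zero extent makes the tensor empty
  std::printf("%d\n", is_empty({2, 3}));  // 0
  return 0;
}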
*/ + +#include "paddle/fluid/operators/l1_norm_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class L1NormOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should be not null."); + + ctx->SetOutputDim("Out", {1}); + } +}; + +class L1NormGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Output(X@GRAD) should be not null."); + + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } +}; + +class L1NormOpMaker : public framework::OpProtoAndCheckerMaker { + public: + L1NormOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor) The input of l1_norm op."); + AddOutput("Out", "(Scalar) The output of l1_norm op."); + AddComment(R"DOC( +L1 Norm Operator. + +Computes the L1 norm of a tensor. + +$$Out = \sum{|X|}$$ + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(l1_norm, ops::L1NormOp, ops::L1NormOpMaker, l1_norm_grad, + ops::L1NormGradOp); +REGISTER_OP_CPU_KERNEL( + l1_norm, ops::L1NormKernel); +REGISTER_OP_CPU_KERNEL( + l1_norm_grad, + ops::L1NormGradKernel); diff --git a/paddle/fluid/operators/l1_norm_op.cu b/paddle/fluid/operators/l1_norm_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..5e9e864a346298a670db63c664491c336a9bd36a --- /dev/null +++ b/paddle/fluid/operators/l1_norm_op.cu @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/l1_norm_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + l1_norm, ops::L1NormKernel); +REGISTER_OP_CUDA_KERNEL( + l1_norm_grad, + ops::L1NormGradKernel); diff --git a/paddle/fluid/operators/l1_norm_op.h b/paddle/fluid/operators/l1_norm_op.h new file mode 100644 index 0000000000000000000000000000000000000000..7ddf2ac6a9046d4d8c2130b459f2385ef4e1301a --- /dev/null +++ b/paddle/fluid/operators/l1_norm_op.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +// Out = sum(abs(X)) +template +class L1NormKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + const framework::Tensor *X = context.Input("X"); + framework::Tensor *Out = context.Output("Out"); + Out->mutable_data(context.GetPlace()); + + auto x = framework::EigenVector::Flatten(*X); + auto out = framework::EigenScalar::From(*Out); + auto &place = + *context.template device_context().eigen_device(); + + out.device(place) = x.abs().sum(); + } +}; + +// dX = dout * sign(X) +template +class L1NormGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + const framework::Tensor *x = context.Input("X"); + const framework::Tensor *d_out = + context.Input(framework::GradVarName("Out")); + PADDLE_ENFORCE(d_out->numel() == 1, "L1 Norm Gradient should be scalar"); + framework::Tensor *dx = + context.Output(framework::GradVarName("X")); + dx->mutable_data(context.GetPlace()); + + auto x_eigen = framework::EigenVector::Flatten(*x); + auto d_out_eigen = framework::EigenVector::Flatten(*d_out); + auto dx_eigen = framework::EigenVector::Flatten(*dx); + auto &place = + *context.template device_context().eigen_device(); + + Eigen::DSizes x_dsize(x->numel()); + dx_eigen.device(place) = d_out_eigen.broadcast(x_dsize) * x_eigen.sign(); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/label_smooth_op.cc b/paddle/fluid/operators/label_smooth_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c018965beefb362ae845d132e34ded1bb2911629 --- /dev/null +++ b/paddle/fluid/operators/label_smooth_op.cc @@ -0,0 +1,128 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
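The two L1-norm kernels above reduce to Out = sum(|X|) and dX = dOut * sign(X). A worked scalar-loop version of both (sketch only; the operator uses Eigen reductions instead of loops):

#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  std::vector<double> x = {1.5, -2.0, 0.5};
  double d_out = 1.0;  // upstream gradient of the scalar output

  // Forward: Out = sum(|X|), as in L1NormKernel.
  double out = 0.0;
  for (double v : x) out += std::fabs(v);

  // Backward: dX = d_out * sign(X), as in L1NormGradKernel.
  std::vector<double> dx;
  for (double v : x) dx.push_back(d_out * (v > 0 ? 1.0 : (v < 0 ? -1.0 : 0.0)));

  std::printf("out=%f dx=[%f, %f, %f]\n", out, dx[0], dx[1], dx[2]);
  // out=4.0 dx=[1.0, -1.0, 1.0]
  return 0;
}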
*/ + +#include "paddle/fluid/operators/label_smooth_op.h" + +namespace paddle { +namespace operators { + +class LabelSmoothOp : public framework::OperatorWithKernel { + public: + LabelSmoothOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of LabelSmoothOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of LabelSmoothOp should not be null."); + auto in_dims = ctx->GetInputDim("X"); + if (ctx->HasInput("PriorDist")) { + auto noise_dims = ctx->GetInputDim("PriorDist"); + auto noise_numel = paddle::framework::product(noise_dims); + PADDLE_ENFORCE( + in_dims[1] == noise_numel, + "The number of elements in Input(PriorDist) must be equal to the " + "dimension of each label."); + } + ctx->ShareLoD("X", /*->*/ "Out"); + ctx->SetOutputDim("Out", in_dims); + } +}; + +class LabelSmoothOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LabelSmoothOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(LoDTensor) The input labels of LabelSmooth operator. This " + "input can be batched labels in one-hot encoding or output from " + "softmax, with shape [N x K], where N is the batch size and K is " + "the number of classes"); + AddInput("PriorDist", + "(Tensor, optional)" + "The prior distribution to be added to the smoothed label. It is " + "fixed during training and the number of elements should be equal " + "to the dimension K of each label. Default is uniform " + "distribution and each element will be set to 1/K if not provided " + "in input.") + .AsDispensable(); + AddOutput("Out", + "(loDTensor) The smoothed label of LabelSmooth operator. It has" + "the same shape and LoD with the Input(LoDTensor)."); + AddAttr("epsilon", + "(float, default 0.0f)" + "The smoothing parameter of LabelSmooth operator.") + .SetDefault(0.0f); + AddComment(R"DOC( +LabelSmooth Operator. + +Label smoothing is a mechanism to regularize the classifier layer. In machine +learning, optimizing the log-likelihood of the correct label directly may +cause two problems. First, it may result in overfitting: if the model learns +to assign full probability to the ground-truth label for each training example, +it is not guaranteed to generalize. Second, it encourages the differences +between the largest logit and all others to become large, reducing the ability +of the model to adapt. Label smoothing is proposed to encourage the model to +be less confident, which replaces the ground-truth label $y$ with the weighted +sum of itself and some fixed distribution $\mu$, i.e. + +$$ + \tilde{y} = (1 - \epsilon) * y + \epsilon * \mu, +$$ + +where $(1 - \epsilon)$ and $\epsilon$ are the weights respectively, and +$\tilde{y}$ is the smoothed label. Usually uniform distribution is used for +$\mu$. This change in the ground-truth label is called label-smoothing +regularization or LSR. + +See more details about label smoothing in https://arxiv.org/abs/1512.00567. 
+ +)DOC"); + } +}; + +class LabelSmoothGradOp : public framework::OperatorWithKernel { + public: + LabelSmoothGradOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) shouldn't be null."); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } +}; + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; + +REGISTER_OP(label_smooth, ops::LabelSmoothOp, ops::LabelSmoothOpMaker, + label_smooth_grad, ops::LabelSmoothGradOp); +REGISTER_OP_CPU_KERNEL( + label_smooth, + ops::LabelSmoothKernel, + ops::LabelSmoothKernel); +REGISTER_OP_CPU_KERNEL( + label_smooth_grad, + ops::LabelSmoothGradKernel, + ops::LabelSmoothGradKernel); diff --git a/paddle/fluid/operators/label_smooth_op.cu b/paddle/fluid/operators/label_smooth_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..4a40a4e9ec82199afae3ae77bb2296a2fa95b0a5 --- /dev/null +++ b/paddle/fluid/operators/label_smooth_op.cu @@ -0,0 +1,26 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/label_smooth_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + label_smooth, + ops::LabelSmoothKernel, + ops::LabelSmoothKernel); +REGISTER_OP_CUDA_KERNEL( + label_smooth_grad, + ops::LabelSmoothGradKernel, + ops::LabelSmoothGradKernel); diff --git a/paddle/fluid/operators/label_smooth_op.h b/paddle/fluid/operators/label_smooth_op.h new file mode 100644 index 0000000000000000000000000000000000000000..15752377f663fcb526b8306158e1e90d743c6cb6 --- /dev/null +++ b/paddle/fluid/operators/label_smooth_op.h @@ -0,0 +1,66 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class LabelSmoothKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* out_t = ctx.Output("Out"); + auto* in_t = ctx.Input("X"); + auto* dist_t = ctx.Input("PriorDist"); + auto label_dim = in_t->dims()[1]; + out_t->mutable_data(ctx.GetPlace()); + + auto epsilon = ctx.Attr("epsilon"); + auto out = framework::EigenVector::Flatten(*out_t); + auto in = framework::EigenVector::Flatten(*in_t); + auto& dev = *ctx.template device_context().eigen_device(); + if (dist_t) { + auto dist = framework::EigenVector::Flatten(*dist_t); + out.device(dev) = + static_cast(1 - epsilon) * in + + epsilon * dist.broadcast(Eigen::DSizes(in_t->numel())); + } else { + out.device(dev) = static_cast(1 - epsilon) * in + + static_cast(epsilon / label_dim); + } + } +}; + +template +class LabelSmoothGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* d_out_t = ctx.Input(framework::GradVarName("Out")); + auto* d_in_t = ctx.Output(framework::GradVarName("X")); + d_in_t->mutable_data(ctx.GetPlace()); + + auto d_out = framework::EigenVector::Flatten(*d_out_t); + auto d_in = framework::EigenVector::Flatten(*d_in_t); + + auto epsilon = ctx.Attr("epsilon"); + auto& dev = *ctx.template device_context().eigen_device(); + d_in.device(dev) = static_cast(1 - epsilon) * d_out; + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/layer_norm_op.cc b/paddle/fluid/operators/layer_norm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..60e37ed01b3cad428dc0184634b4d36c9f24f9c5 --- /dev/null +++ b/paddle/fluid/operators/layer_norm_op.cc @@ -0,0 +1,173 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
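Applying the uniform-prior branch of LabelSmoothKernel above to a one-hot label with K = 3 classes and epsilon = 0.1 gives the smoothed target below (standalone sketch; the values are chosen for illustration):

#include <cstdio>
#include <vector>

int main() {
  const float epsilon = 0.1f;
  std::vector<float> y = {0.f, 1.f, 0.f};  // one-hot label, K = 3 classes
  const float k = static_cast<float>(y.size());

  // Same expression as the uniform branch of the kernel:
  // out = (1 - epsilon) * y + epsilon / K
  for (float& v : y) v = (1.f - epsilon) * v + epsilon / k;

  std::printf("[%f, %f, %f]\n", y[0], y[1], y[2]);
  // [0.033333, 0.933333, 0.033333]
  return 0;
}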
*/ + +#include "paddle/fluid/operators/layer_norm_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using DataLayout = framework::DataLayout; + +class LayerNormOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of LayerNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Y"), + "Output(Y) of LayerNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Mean"), + "Output(Mean) of LayerNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Variance"), + "Output(Variance) of LayerNormOp should not be null."); + + auto x_dim = ctx->GetInputDim("X"); + auto begin_norm_axis = ctx->Attrs().Get("begin_norm_axis"); + PADDLE_ENFORCE_LT(begin_norm_axis, x_dim.size(), + "'begin_norm_axis' must be less than the rank of X."); + + auto matrix_dim = framework::flatten_to_2d(x_dim, begin_norm_axis); + int left = static_cast(matrix_dim[0]); + int right = static_cast(matrix_dim[1]); + if (ctx->HasInput("Scale")) { + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], right); + } + if (ctx->HasInput("Bias")) { + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], right); + } + + ctx->SetOutputDim("Y", ctx->GetInputDim("X")); + ctx->SetOutputDim("Mean", {left}); + ctx->SetOutputDim("Variance", {left}); + ctx->ShareLoD("X", "Y"); + } +}; + +class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LayerNormOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(LoDTensor) The input tensor."); + AddInput("Scale", + "(Tensor, optional) Scale is a 1-dimensional tensor of size " + "H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])." + "It is applied to the output.") + .AsDispensable(); + AddInput("Bias", + "(Tensor, optional) Bias is a 1-dimensional tensor of size " + "H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])." + "It is applied to the output.") + .AsDispensable(); + AddOutput("Y", "(LoDTensor) Result after normalization."); + AddOutput("Mean", "(Tensor) Mean of the current mini batch.") + .AsIntermediate(); + AddOutput("Variance", "(Tensor) Variance of the current mini batch.") + .AsIntermediate(); + + AddAttr("epsilon", + "(float, default 1e-5) Constant for " + "numerical stability") + .SetDefault(1e-5) + .AddCustomChecker([](const float &epsilon) { + PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f, + "'epsilon' should be between 0.0 and 0.001."); + }); + AddAttr("begin_norm_axis", + "(int default:1), the " + "axis of `begin_norm_axis ... Rank(X) - 1` will be " + "normalized. `begin_norm_axis` splits the tensor(`X`) to a " + "matrix [N,H].") + .SetDefault(1) + .AddCustomChecker([](const int &begin_norm_axis) { + PADDLE_ENFORCE_GT(begin_norm_axis, 0, + "'begin_norm_axis' should be greater than zero."); + }); + + AddComment(R"DOC( +Layer Normalization. +Layer Norm has been implemented as discussed in the paper: +https://arxiv.org/abs/1607.06450 +... 
+)DOC"); + } +}; + +class LayerNormGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + // check input + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of LayerNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Mean"), + "Input(Mean) of LayerNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Variance"), + "Input(Variance) of LayerNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), + "Input(Y@GRAD) of LayerNormOp should not be null."); + + // check output + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } + if (ctx->HasOutput(framework::GradVarName("Scale"))) { + ctx->SetOutputDim(framework::GradVarName("Scale"), + ctx->GetInputDim("Scale")); + } + if (ctx->HasOutput(framework::GradVarName("Bias"))) { + ctx->SetOutputDim(framework::GradVarName("Bias"), + ctx->GetInputDim("Bias")); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + const auto *var = ctx.InputVar(framework::GradVarName("Y")); + if (var == nullptr) { + PADDLE_THROW("can't find Y@GRAD"); + } + const Tensor *t = nullptr; + if (var->IsType()) { + t = &var->Get(); + } else if (var->IsType()) { + t = &var->Get(); + } + if (t == nullptr) { + PADDLE_THROW("can't find Y@GRAD"); + } + return framework::OpKernelType(framework::ToDataType(t->type()), + ctx.GetPlace()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker, + layer_norm_grad, ops::LayerNormGradOp); +REGISTER_OP_CPU_KERNEL( + layer_norm, ops::LayerNormKernel, + ops::LayerNormKernel); +REGISTER_OP_CPU_KERNEL( + layer_norm_grad, + ops::LayerNormGradKernel, + ops::LayerNormGradKernel); diff --git a/paddle/fluid/operators/layer_norm_op.cu b/paddle/fluid/operators/layer_norm_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..aa54fd54155ce19298ad9f80c930ad08e542d71c --- /dev/null +++ b/paddle/fluid/operators/layer_norm_op.cu @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/layer_norm_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + layer_norm, + ops::LayerNormKernel, + ops::LayerNormKernel); +REGISTER_OP_CUDA_KERNEL( + layer_norm_grad, + ops::LayerNormGradKernel, + ops::LayerNormGradKernel); diff --git a/paddle/fluid/operators/layer_norm_op.h b/paddle/fluid/operators/layer_norm_op.h new file mode 100644 index 0000000000000000000000000000000000000000..60c0b07add172520dc8062d3d5e8e4e69758e1f1 --- /dev/null +++ b/paddle/fluid/operators/layer_norm_op.h @@ -0,0 +1,238 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +#include "paddle/fluid/operators/elementwise_op_function.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +struct SubAndSquareFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return (a - b) * (a - b); } +}; + +template +struct DivAndSqrtFunctor { + explicit DivAndSqrtFunctor(T epsilon) { epsilon_ = epsilon; } + inline HOSTDEVICE T operator()(T a, T b) const { + return a / (sqrt(b + epsilon_)); + } + + private: + T epsilon_; +}; + +template +struct MulFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a * b; } +}; + +template +struct AddFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a + b; } +}; + +template +struct SubFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a - b; } +}; + +template +struct MulInvVarFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { + return a * std::sqrt(1.0 / b); + } +}; + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using DataLayout = framework::DataLayout; + +template +class LayerNormKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const float epsilon = ctx.Attr("epsilon"); + auto *scale = ctx.Input("Scale"); + auto *bias = ctx.Input("Bias"); + auto x = *ctx.Input("X"); + + auto *y = ctx.Output("Y"); + auto *mean = ctx.Output("Mean"); + auto *var = ctx.Output("Variance"); + const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); + + const auto x_dims = x.dims(); + + y->mutable_data(ctx.GetPlace()); + mean->mutable_data(ctx.GetPlace()); + var->mutable_data(ctx.GetPlace()); + + auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); + int left = static_cast(matrix_dim[0]); + int right = static_cast(matrix_dim[1]); + framework::DDim matrix_shape({left, right}); + + x.Resize(matrix_shape); + Tensor out; + out.ShareDataWith(*y); + out.Resize(matrix_shape); + + auto &dev_ctx = ctx.template device_context(); + math::RowwiseMean row_mean; + + // get mean + row_mean(dev_ctx, x, mean); + + // get variance + ElementwiseComputeEx, DeviceContext, T>( + ctx, &x, mean, /*axis*/ 0, SubAndSquareFunctor(), &out); + row_mean(dev_ctx, out, var); + + // get x_norm + ElementwiseComputeEx, DeviceContext, T>( + ctx, &x, mean, /*axis*/ 0, SubFunctor(), &out); + ElementwiseComputeEx, DeviceContext, T>( + ctx, &out, var, /*axis*/ 0, + DivAndSqrtFunctor(static_cast(epsilon)), &out); + + if (scale) { + ElementwiseComputeEx, DeviceContext, T>( + ctx, &out, scale, /*axis*/ 1, MulFunctor(), &out); + } + if (bias) { + ElementwiseComputeEx, DeviceContext, T>( + ctx, &out, bias, /*axis*/ 1, AddFunctor(), &out); + } + } +}; + +template +class LayerNormGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const float epsilon = 
ctx.Attr("epsilon"); + auto x = *ctx.Input("X"); + auto *y = ctx.Input("Y"); + auto *mean = ctx.Input("Mean"); + auto *var = ctx.Input("Variance"); + auto *scale = ctx.Input("Scale"); + auto *bias = ctx.Input("Bias"); + auto d_y = *ctx.Input(framework::GradVarName("Y")); + const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); + + // init output + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_scale = ctx.Output(framework::GradVarName("Scale")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + + const auto &x_dims = x.dims(); + auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); + int left = static_cast(matrix_dim[0]); + int right = static_cast(matrix_dim[1]); + framework::DDim matrix_shape({left, right}); + + d_y.Resize(matrix_shape); + auto &dev_ctx = ctx.template device_context(); + math::ColwiseSum colwise_sum; + + Tensor temp; + Tensor temp_norm; + if (d_scale || d_x) { + x.Resize(matrix_shape); + temp.mutable_data(matrix_shape, ctx.GetPlace()); + + if (!(bias && scale)) { + temp_norm.ShareDataWith(*y); + temp_norm.Resize(matrix_shape); + } else { + temp_norm.mutable_data(matrix_shape, ctx.GetPlace()); + // get x_norm + ElementwiseComputeEx, DeviceContext, T>( + ctx, &x, mean, /*axis*/ 0, SubFunctor(), &temp_norm); + ElementwiseComputeEx, DeviceContext, T>( + ctx, &temp_norm, var, /*axis*/ 0, + DivAndSqrtFunctor(static_cast(epsilon)), &temp_norm); + } + } + + if (d_bias) { + d_bias->mutable_data(ctx.GetPlace()); + colwise_sum(dev_ctx, d_y, d_bias); + } + if (d_scale) { + d_scale->mutable_data(ctx.GetPlace()); + ElementwiseComputeEx, DeviceContext, T>( + ctx, &temp_norm, &d_y, /*axis*/ 0, MulFunctor(), &temp); + colwise_sum(dev_ctx, temp, d_scale); + } + + if (d_x) { + framework::DDim vec_shape({left}); + d_x->mutable_data(ctx.GetPlace()); + auto dx_dim = d_x->dims(); + Tensor temp_vec; + temp_vec.mutable_data(vec_shape, ctx.GetPlace()); + + math::RowwiseMean row_mean; + + if (d_scale) { + // dy_dx + ElementwiseComputeEx, DeviceContext, T>( + ctx, &d_y, scale, /*axis*/ 1, MulFunctor(), &temp); + framework::Copy(temp, ctx.GetPlace(), ctx.device_context(), d_x); + + // dy_dmean_dx + row_mean(dev_ctx, temp, &temp_vec); + ElementwiseComputeEx, DeviceContext, T>( + ctx, d_x, &temp_vec, /*axis*/ 0, SubFunctor(), d_x); + + // dy_var_dx + ElementwiseComputeEx, DeviceContext, T>( + ctx, &temp, &temp_norm, /*axis*/ 0, MulFunctor(), &temp); + } else { + // dy_dx + framework::Copy(d_y, ctx.GetPlace(), ctx.device_context(), d_x); + + // dy_dmean_dx + row_mean(dev_ctx, d_y, &temp_vec); + ElementwiseComputeEx, DeviceContext, T>( + ctx, d_x, &temp_vec, /*axis*/ 0, SubFunctor(), d_x); + + // dy_var_dx + ElementwiseComputeEx, DeviceContext, T>( + ctx, &d_y, &temp_norm, /*axis*/ 0, MulFunctor(), &temp); + } + // dy_var_dx + row_mean(dev_ctx, temp, &temp_vec); + ElementwiseComputeEx, DeviceContext, T>( + ctx, &temp_norm, &temp_vec, /*axis*/ 0, MulFunctor(), &temp); + ElementwiseComputeEx, DeviceContext, T>( + ctx, d_x, &temp, /*axis*/ 0, SubFunctor(), d_x); + + ElementwiseComputeEx, DeviceContext, T>( + ctx, d_x, var, /*axis*/ 0, + DivAndSqrtFunctor(static_cast(epsilon)), d_x); + d_x->Resize(dx_dim); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/linear_chain_crf_op.cc b/paddle/fluid/operators/linear_chain_crf_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3e1dfa494872b6f187ee0b3cca399308a1cab42a --- /dev/null +++ b/paddle/fluid/operators/linear_chain_crf_op.cc @@ -0,0 +1,269 
@@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/linear_chain_crf_op.h" + +namespace paddle { +namespace operators { + +class LinearChainCRFOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LinearChainCRFOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Emission", + "(LoDTensor, default LoDTensor) " + "A 2-D LoDTensor with shape [N x D], where N is the size of the " + "mini-batch and D is the total tag number. The unscaled emission " + "weight matrix for the linear chain CRF. "); + AddInput("Transition", + "(Tensor, default Tensor) A 2-D Tensor with shape " + "[(D + 2) x D]. The learnable parameter for the linear_chain_crf " + "operator. See more details in the operator's comments."); + AddInput("Label", + "(LoDTensor, default LoDTensor) A LoDTensor with shape " + "[N x 1], where N is the total element number in a mini-batch. " + "The ground truth."); + AddOutput( + "Alpha", + "(Tensor, default Tensor) A 2-D Tensor with shape [N x D]. " + "The forward vectors for the entire batch. Denote it as $\alpha$. " + "$\alpha$ is a memo table used to calculate the normalization " + "factor in CRF. $\alpha[k, v]$ stores the unnormalized " + "probabilites of all possible unfinished sequences of tags that end at " + "position $k$ with tag $v$. For each $k$, " + "$\alpha[k, v]$ is a vector of length $D$ with a component for " + "each tag value $v$. This vector is called a forward vecotr and " + "will also be used in backward computations.") + .AsIntermediate(); + AddOutput( + "EmissionExps", + "(Tensor, default Tensor) A 2-D Tensor with shape [N x D]. " + "The exponentials of Input(Emission). This is an intermediate " + "computational result in forward computation, and will be reused in " + "backward computation.") + .AsIntermediate(); + AddOutput( + "TransitionExps", + "(Tensor, default Tensor) A 2-D Tensor with shape " + "[(D + 2) x D]. The exponentials of Input(Transition). This is an " + "intermediate computational result in forward computation, and " + "will be reused in backward computation.") + .AsIntermediate(); + AddOutput( + "LogLikelihood", + "(Tensor, default Tensor) The logarithm of the conditional " + "likelihood of each training sample in a mini-batch. This is a 2-D " + "tensor with shape [S x 1], where S is the sequence number in a " + "mini-batch. Note: S is equal to the sequence number in a mini-batch. " + "The output is no longer a LoDTensor."); + AddComment(R"DOC( +LinearChainCRF Operator. + +Conditional Random Field defines an undirected probabilistic graph with nodes +denoting random variables and edges denoting dependencies between these +variables. CRF learns the conditional probability $P(Y|X)$, where +$X = (x_1, x_2, ... , x_n)$ are structured inputs and +$Y = (y_1, y_2, ... , y_n)$ are labels for the inputs. + +Linear chain CRF is a special case of CRF that is useful for sequence labeling +task. 
Sequence labeling tasks do not assume a lot of conditional +independences among inputs. The only constraint they impose is that the input +and output must be linear sequences. Thus, the graph of such a CRF is a simple +chain or a line, which results in the linear chain CRF. + +This operator implements the Forward-Backward algorithm for the linear chain +CRF. Please refer to http://www.cs.columbia.edu/~mcollins/fb.pdf and +http://cseweb.ucsd.edu/~elkan/250Bwinter2012/loglinearCRFs.pdf for details. + +Equation: +1. Denote Input(Emission) to this operator as $x$ here. +2. The first D values of Input(Transition) to this operator are for starting +weights, denoted as $a$ here. +3. The next D values of Input(Transition) of this operator are for ending +weights, denoted as $b$ here. +4. The remaning values of Input(Transition) are for transition weights, +denoted as $w$ here. +5. Denote Input(Label) as $s$ here. + +The probability of a sequence $s$ of length $L$ is defined as: +$$P(s) = (1/Z) \exp(a_{s_1} + b_{s_L} + + \sum_{l=1}^L x_{s_l} + + \sum_{l=2}^L w_{s_{l-1},s_l})$$ + +where $Z$ is a normalization value so that the sum of $P(s)$ over +all possible sequences is 1, and $x$ is the emission feature weight +to the linear chain CRF. + +Finally, the linear chain CRF operator outputs the logarithm of the conditional +likelihood of each training sample in a mini-batch. + +NOTE: +1. The feature function for a CRF is made up of the emission features and the +transition features. The emission feature weights are NOT computed in +this operator. They MUST be computed first before this operator is called. + +2. Because this operator performs global normalization over all possible +sequences internally, it expects UNSCALED emission feature weights. +Please do not call this op with the emission feature being output of any +nonlinear activation. + +3. The 2nd dimension of Input(Emission) MUST be equal to the tag number. 
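The unnormalized score in the equation above combines a start weight for the first tag, an end weight for the last tag, a per-position emission weight, and pairwise transition weights. A hedged standalone sketch of that score (toy tensors; D is the tag number, and the separate a/b/w arrays correspond to the rows of the [(D + 2) x D] Transition input described above):

```cpp
#include <cstdio>
#include <vector>

// Unnormalized log-score of a tag sequence under a linear chain CRF:
//   score(s) = a[s_0] + b[s_{L-1}] + sum_k x[k][s_k] + sum_{k>0} w[s_{k-1}][s_k]
// P(s) is then exp(score(s)) / Z, with Z summing exp(score) over all sequences.
float SequenceScore(const std::vector<std::vector<float>>& x,  // [L x D] emissions
                    const std::vector<float>& a,               // start weights [D]
                    const std::vector<float>& b,               // end weights [D]
                    const std::vector<std::vector<float>>& w,  // transitions [D x D]
                    const std::vector<int>& s) {               // tag sequence [L]
  const size_t L = s.size();
  float score = a[s[0]] + b[s[L - 1]];
  for (size_t k = 0; k < L; ++k) score += x[k][s[k]];
  for (size_t k = 1; k < L; ++k) score += w[s[k - 1]][s[k]];
  return score;
}

int main() {
  std::vector<std::vector<float>> x = {{0.5f, 0.1f}, {0.2f, 0.9f}};
  std::vector<float> a = {0.3f, 0.0f}, b = {0.0f, 0.4f};
  std::vector<std::vector<float>> w = {{0.1f, 0.7f}, {0.2f, 0.1f}};
  // 0.3 + 0.4 + 0.5 + 0.9 + 0.7 = 2.80
  std::printf("score = %.2f\n", SequenceScore(x, a, b, w, {0, 1}));
  return 0;
}
```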
+ +)DOC"); + } +}; + +class LinearChainCRFOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Emission"), + "Input(Emission) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Transition"), + "Input(Transition) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null."); + + PADDLE_ENFORCE(ctx->HasOutput("Alpha"), + "Output(Alpha) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput("EmissionExps"), + "Output(EmissionExps) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput("TransitionExps"), + "Output(TransitionExps) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput("LogLikelihood"), + "Output(LogLikelihood) should be not null."); + + auto emission_dims = ctx->GetInputDim("Emission"); + PADDLE_ENFORCE_EQ(emission_dims.size(), 2UL, + "The Input(Emission) should be a 2-D tensor."); + PADDLE_ENFORCE(emission_dims[0], "An empty mini-batch is not allowed."); + + auto transition_dims = ctx->GetInputDim("Transition"); + PADDLE_ENFORCE_EQ(transition_dims.size(), 2UL, + "The Input(Transition) should be a 2-D tensor."); + PADDLE_ENFORCE_EQ( + transition_dims[0] - 2, transition_dims[1], + "An invalid dimension for the Input(Transition), which should " + "be a 2-D tensor with shape [(D + 2) x D]."); + PADDLE_ENFORCE_EQ( + emission_dims[1], transition_dims[1], + "The 2nd dimension of the Input(Emission) and the Input(Transition) " + "should be equal to the tag number."); + + auto label_dims = ctx->GetInputDim("Label"); + PADDLE_ENFORCE(label_dims.size() == 2UL && label_dims[1] == 1UL, + "The Input(Label) should be a 2-D tensor with the 2nd " + "dimensions fixed to 1."); + PADDLE_ENFORCE_EQ( + emission_dims[0], label_dims[0], + "The height of Input(Emission) and the height of Input(Label) " + "should be the same."); + + ctx->SetOutputDim("Alpha", emission_dims); + ctx->SetOutputDim("EmissionExps", emission_dims); + ctx->SetOutputDim("TransitionExps", transition_dims); + // TODO(caoying) This is tricky. The 1st dimension of Output(LogLikelihood) + // is the sequence number in a mini-batch. The dimension set here should be + // resized to its correct size in the function Compute. Fix this once we can + // get LoD information in the InferShape interface. + ctx->SetOutputDim("LogLikelihood", {emission_dims[0], 1}); + } + + protected: + // Explicitly set that the data type of computation kernel of linear_chain_crf + // is determined by its input "Emission". 
+ framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Emission")->type()), + platform::CPUPlace()); + } +}; + +class LinearChainCRFGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("EmissionExps"), + "Input(EmissionExps) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("TransitionExps"), + "Input(TransitionExps) should be not null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("LogLikelihood")), + "Input(LogLikelihood@GRAD) shoudl be not null."); + + auto emission_exps_dims = ctx->GetInputDim("EmissionExps"); + PADDLE_ENFORCE_EQ(emission_exps_dims.size(), 2UL, + "The Input(EmissionExps) should be a 2-D tensor."); + PADDLE_ENFORCE(emission_exps_dims[0], + "An empty mini-batch is not allowed."); + + auto transition_exps_dims = ctx->GetInputDim("TransitionExps"); + PADDLE_ENFORCE_EQ(transition_exps_dims.size(), 2UL, + "The Input(TransitionExps) should be a 2-D tensor."); + PADDLE_ENFORCE_EQ( + transition_exps_dims[0] - 2, transition_exps_dims[1], + "An invalid dimension for the Input(TransitionExps), which should " + "be a 2-D tensor with shape [(D + 2) x D]."); + PADDLE_ENFORCE_EQ( + emission_exps_dims[1], transition_exps_dims[1], + "The 2nd dimension of the Input(EmissionExps) and the " + "Input(TransitionExps) should be equal to the tag number."); + + auto label_dims = ctx->GetInputDim("Label"); + PADDLE_ENFORCE(label_dims.size() == 2UL && label_dims[1] == 1UL, + "The Input(Label) should be a 2-D tensor with the 2nd " + "dimensions fixed to 1."); + PADDLE_ENFORCE_EQ( + emission_exps_dims[0], label_dims[0], + "The height of Input(EmissionExps) and the height of Input(Label) " + "should be the same."); + + if (ctx->HasOutput(framework::GradVarName("Emission"))) { + ctx->SetOutputDim(framework::GradVarName("Emission"), emission_exps_dims); + } + if (ctx->HasOutput(framework::GradVarName("Transition"))) { + ctx->SetOutputDim(framework::GradVarName("Transition"), + transition_exps_dims); + } + } + + protected: + // Explicitly set that the data type of output of the linear_chain_crf_grad + // operator is determined by its input: gradients of LogLikelihood. + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType( + ctx.Input(framework::GradVarName("LogLikelihood")) + ->type()), + platform::CPUPlace()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(linear_chain_crf, ops::LinearChainCRFOp, ops::LinearChainCRFOpMaker, + linear_chain_crf_grad, ops::LinearChainCRFGradOp); +REGISTER_OP_CPU_KERNEL( + linear_chain_crf, + ops::LinearChainCRFOpKernel, + ops::LinearChainCRFOpKernel); +REGISTER_OP_CPU_KERNEL( + linear_chain_crf_grad, + ops::LinearChainCRFGradOpKernel, + ops::LinearChainCRFGradOpKernel); diff --git a/paddle/fluid/operators/linear_chain_crf_op.cu b/paddle/fluid/operators/linear_chain_crf_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..6e04e76eebc71def814fed65a469ef5f9f1b16b0 --- /dev/null +++ b/paddle/fluid/operators/linear_chain_crf_op.cu @@ -0,0 +1,27 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/linear_chain_crf_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + linear_chain_crf, + ops::LinearChainCRFOpKernel, + ops::LinearChainCRFOpKernel); +REGISTER_OP_CUDA_KERNEL( + linear_chain_crf_grad, + ops::LinearChainCRFGradOpKernel, + ops::LinearChainCRFGradOpKernel); diff --git a/paddle/fluid/operators/linear_chain_crf_op.h b/paddle/fluid/operators/linear_chain_crf_op.h new file mode 100644 index 0000000000000000000000000000000000000000..15b64c09bf366b356683c47c82a6dbf9529d9b58 --- /dev/null +++ b/paddle/fluid/operators/linear_chain_crf_op.h @@ -0,0 +1,353 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +static inline T NormalizeL1(T* x, size_t len) { + T sum = 0.; + for (size_t i = 0; i < len; ++i) sum += x[i]; + // (This comment is from the old LinearChainCRFLayer.) + // Right now, we just bet that sum won't be zero. If this really happens, we + // will figure out what should be done then. + PADDLE_ENFORCE(sum, + "The unnormalized probabilities of all possible unfinished " + "sequences must be greater than 0."); + T s = 1. / sum; + for (size_t i = 0; i < len; ++i) x[i] *= s; + return sum; +} + +template +struct ScalarMul { + explicit ScalarMul(const T& scalar) : scalar(scalar) {} + T operator()(const T& val) const { return val * scalar; } + + T scalar; +}; + +using framework::LoDTensor; +using framework::LoD; +using framework::Tensor; +template +using EigenMatrix = framework::EigenMatrix; + +template +class LinearChainCRFOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + // TODO(caoying) The checks related to LoD information should be + // moved into InferShape once after the InferShape is refactored. 
+ PADDLE_ENFORCE_EQ(ctx.Input("Emission")->NumLevels(), 1UL, + "The Input(Emission) should be a sequence."); + PADDLE_ENFORCE_EQ(ctx.Input("Label")->NumLevels(), 1UL, + "The Input(Label) should be a sequence."); + auto in_lod = ctx.Input("Label")->lod(); + PADDLE_ENFORCE(in_lod.size(), "Input(Label) must be a sequence."); + const size_t level = 0; + const size_t seq_num = in_lod[level].size() - 1; + + const LoDTensor* emission_weights = ctx.Input("Emission"); + const Tensor* transition_weights = ctx.Input("Transition"); + const LoDTensor* label = ctx.Input("Label"); + + Tensor* emission_exps = ctx.Output("EmissionExps"); + Tensor* transition_exps = ctx.Output("TransitionExps"); + Tensor* alpha = ctx.Output("Alpha"); + Tensor* ll = ctx.Output("LogLikelihood"); + + // Because the computation codes only runs on CPU, here the memory for all + // the outputs is FIXED to be allocated on the CPU memory. + emission_exps->mutable_data(platform::CPUPlace()); + transition_exps->mutable_data(platform::CPUPlace()); + alpha->mutable_data(platform::CPUPlace()); + + // Resize the output tensor to its correct dimension. + ll->Resize({static_cast(seq_num), 1}); + ll->mutable_data(platform::CPUPlace()); + + // Now, all the inputs and outputs should be on the CPU memory. + auto emission_dims = emission_weights->dims(); + const size_t batch_size = emission_dims[0]; + const size_t tag_num = emission_dims[1]; + + Tensor emission_row_max; + emission_row_max.mutable_data( + framework::make_ddim({static_cast(batch_size), 1}), + platform::CPUPlace()); + + auto& place = *ctx.template device_context() + .eigen_device(); + auto x = EigenMatrix::From(*emission_weights); + auto x_row_max = EigenMatrix::From(emission_row_max); + x_row_max.device(place) = + x.maximum(Eigen::DSizes(1)) + .reshape(Eigen::DSizes(int(batch_size), 1)); + + auto x_exps = EigenMatrix::From(*emission_exps); + x_exps.device(place) = + (x - x_row_max.broadcast(Eigen::DSizes(1, tag_num))).exp(); + + auto w = EigenMatrix::From(*transition_weights); + auto w_exps = EigenMatrix::From(*transition_exps); + w_exps.device(place) = w.exp(); + + T* log_likelihood = ll->data(); + for (size_t i = 0; i < seq_num; ++i) { + int start_pos = static_cast(in_lod[level][i]); + int end_pos = static_cast(in_lod[level][i + 1]); + if (end_pos == start_pos) { + // If an empty input sequence is given, pad 0 for its cost. 
+ log_likelihood[i] = 0.; + continue; + } + + const Tensor one_seq = emission_weights->Slice(start_pos, end_pos); + Tensor one_seq_row_max = emission_row_max.Slice(start_pos, end_pos); + Tensor one_seq_exps = emission_exps->Slice(start_pos, end_pos); + const Tensor one_seq_label = label->Slice(start_pos, end_pos); + Tensor one_seq_alpha = alpha->Slice(start_pos, end_pos); + + log_likelihood[i] = ForwardOneSequence( + one_seq, one_seq_row_max, one_seq_exps, *transition_weights, + *transition_exps, one_seq_label, &one_seq_alpha); + } + }; + + private: + T ForwardOneSequence(const Tensor& emission, const Tensor& emission_row_max, + const Tensor& emission_exps, const Tensor& trans_weights, + const Tensor& trans_weight_exps, const Tensor& label, + Tensor* alpha) const { + const T* x = emission.data(); + const T* x_row_max = emission_row_max.data(); + const T* x_exps = emission_exps.data(); + const T* w = trans_weights.data(); + const T* w_exps = trans_weight_exps.data(); + T* alpha_value = alpha->data(); + + auto x_dims = emission.dims(); + const size_t seq_length = x_dims[0]; + const size_t tag_num = x_dims[1]; + // The 1st row of w are transition weights for start mask. + // The 2nd row of w are transition weights for end mask. + // Transition weights between other tags begin from the 3rd row of w. + const size_t state_trans_base_idx = 2; + + for (size_t i = 0; i < tag_num; ++i) { + alpha_value[i] = w_exps[i] * x_exps[i]; + } + T ll = -x_row_max[0] - std::log(NormalizeL1(alpha_value, tag_num)); + + for (size_t k = 1; k < seq_length; ++k) { + for (size_t i = 0; i < tag_num; ++i) { + T sum = 0.; + for (size_t j = 0; j < tag_num; ++j) { + sum += alpha_value[(k - 1) * tag_num + j] * // (*) + w_exps[(j + state_trans_base_idx) * tag_num + i]; + } + alpha_value[k * tag_num + i] = x_exps[k * tag_num + i] * sum; + } + // NormalizeL1 is to avoid underflow or overflow at (*). + ll -= x_row_max[k] + + std::log(NormalizeL1(alpha_value + k * tag_num, tag_num)); + } + T sum = 0.; + for (size_t i = 0; i < tag_num; ++i) { + sum += alpha_value[(seq_length - 1) * tag_num + i] * w_exps[tag_num + i]; + } + ll -= std::log(sum); + // Now ll is equal to -log(Z). + + const int64_t* lbl = label.data(); + PADDLE_ENFORCE_LT( + static_cast(*std::max_element(lbl, lbl + seq_length)), tag_num, + "An invalid tag label that execesses the largest tag number."); + + // Calculate the nominator part, which depends on the label sequence. + ll += w[lbl[0]] /*start transition*/ + x[lbl[0]] + + w[tag_num + lbl[seq_length - 1]] /*end transition*/; + for (size_t k = 1; k < seq_length; ++k) { + ll += x[k * tag_num + lbl[k]] + + w[(lbl[k - 1] + state_trans_base_idx) * tag_num + lbl[k]]; + } + return -ll; + } +}; + +template +class LinearChainCRFGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const size_t level = 0; // currently, only support sequence. + auto lod = ctx.Input("Label")->lod(); + PADDLE_ENFORCE(lod.size(), "Input(Label) must be a sequence."); + + const Tensor* label = ctx.Input("Label"); + const Tensor* emission_exps = ctx.Input("EmissionExps"); + const Tensor* transition_exps = ctx.Input("TransitionExps"); + const Tensor* alpha = ctx.Input("Alpha"); + const T* ll_grad = + ctx.Input(framework::GradVarName("LogLikelihood"))->data(); + + Tensor* emission_grad = + ctx.Output(framework::GradVarName("Emission")); + Tensor* transition_grad = + ctx.Output(framework::GradVarName("Transition")); + + // TODO(caoying) Fix this constraint. 
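ForwardOneSequence above evaluates log Z with the forward (alpha) recursion on exponentiated weights, renormalizing alpha at every position (NormalizeL1) so products of many small exponentials do not underflow; the dropped normalizers are re-added through the running log term. A compact sketch of that recursion under the same layout assumptions as the previous snippet (it omits the per-row max subtraction the kernel also performs, and all values are illustrative):

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

using Matrix = std::vector<std::vector<float>>;

// Rescale a vector so its entries sum to 1 and return the dropped sum,
// mirroring the role of NormalizeL1 in the kernel above.
float NormalizeL1(std::vector<float>* v) {
  float sum = 0.f;
  for (float e : *v) sum += e;
  for (float& e : *v) e /= sum;
  return sum;
}

// log Z for a linear chain CRF via the forward (alpha) recursion.
// x: [L x D] emissions, a/b: start/end weights [D], w: transitions [D x D].
float LogPartition(const Matrix& x, const std::vector<float>& a,
                   const std::vector<float>& b, const Matrix& w) {
  const size_t L = x.size(), D = a.size();
  std::vector<float> alpha(D);
  float log_z = 0.f;
  for (size_t i = 0; i < D; ++i) alpha[i] = std::exp(a[i] + x[0][i]);
  log_z += std::log(NormalizeL1(&alpha));
  for (size_t k = 1; k < L; ++k) {
    std::vector<float> next(D, 0.f);
    for (size_t i = 0; i < D; ++i) {
      float sum = 0.f;
      for (size_t j = 0; j < D; ++j) sum += alpha[j] * std::exp(w[j][i]);
      next[i] = std::exp(x[k][i]) * sum;
    }
    alpha = next;
    log_z += std::log(NormalizeL1(&alpha));  // keep alpha well scaled
  }
  float tail = 0.f;
  for (size_t i = 0; i < D; ++i) tail += alpha[i] * std::exp(b[i]);
  return log_z + std::log(tail);
}

int main() {
  Matrix x = {{0.5f, 0.1f}, {0.2f, 0.9f}};
  Matrix w = {{0.1f, 0.7f}, {0.2f, 0.1f}};
  std::printf("log Z = %.4f\n", LogPartition(x, {0.3f, 0.f}, {0.f, 0.4f}, w));
  return 0;
}
```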
When the Input(Emission) is from the + // data reader operator, it can have no gradients. + PADDLE_ENFORCE(emission_grad, "Output(Emission@Grad) should not be null."); + emission_grad->mutable_data(platform::CPUPlace()); + if (transition_grad) { + transition_grad->mutable_data(platform::CPUPlace()); + math::set_constant(ctx.device_context(), transition_grad, 0.); + } + // Now, all the inputs and outputs should be on the CPU memory. + + auto emission_dims = emission_exps->dims(); + // Beta is the memo table used in dynamic programming to calculate the + // backwark vectors. For a backward vector i (the i-th row of beta), it + // captures the unnormalized probabilities of partial sequences starting + // at position i. + Tensor beta; + beta.mutable_data(emission_dims, platform::CPUPlace()); + + for (size_t i = 0; i < lod[level].size() - 1; ++i) { + int start_pos = static_cast(lod[level][i]); + int end_pos = static_cast(lod[level][i + 1]); + if (end_pos == start_pos) continue; + + const Tensor one_seq_emission_exps = + emission_exps->Slice(start_pos, end_pos); + const Tensor one_seq_label = label->Slice(start_pos, end_pos); + const Tensor one_seq_alpha = alpha->Slice(start_pos, end_pos); + Tensor one_seq_beta = beta.Slice(start_pos, end_pos); + Tensor one_seq_emission_grad = emission_grad->Slice(start_pos, end_pos); + + BackwardOneSequence( + ctx.template device_context(), ll_grad[i], + one_seq_emission_exps, *transition_exps, one_seq_alpha, one_seq_label, + &one_seq_beta, transition_grad, &one_seq_emission_grad); + } + }; + + private: + void BackwardOneSequence(const platform::CPUDeviceContext& ctx, + const T ll_grad, const Tensor& emission_exps, + const Tensor& transition_exps, const Tensor& alpha, + const Tensor& label, Tensor* beta, + Tensor* transition_grad, + Tensor* emission_grad) const { + const T* w_exps = transition_exps.data(); + const T* x_exps = emission_exps.data(); + const int64_t* label_value = label.data(); + T* beta_value = beta->data(); + + auto x_dims = emission_exps.dims(); + const size_t seq_length = x_dims[0]; + const size_t tag_num = x_dims[1]; + const size_t state_trans_base_idx = 2; + + // Calculate the backward vectors: beta. + // First, calculate the initialition state. + for (size_t i = 0; i < tag_num; ++i) { + beta_value[(seq_length - 1) * tag_num + i] = w_exps[tag_num + i]; + } + NormalizeL1(beta_value + (seq_length - 1) * tag_num, tag_num); + for (int k = static_cast(seq_length) - 2; k >= 0; --k) { + for (size_t i = 0; i < tag_num; ++i) { + T sum = 0.; + for (size_t j = 0; j < tag_num; ++j) { + sum += w_exps[(i + state_trans_base_idx) * tag_num + j] * // (**) + x_exps[(k + 1) * tag_num + j] * + beta_value[(k + 1) * tag_num + j]; + } + beta_value[k * tag_num + i] = sum; + } + // NormalizeL1 is to avoid underflow or overflow at (**). 
+ NormalizeL1(beta_value + k * tag_num, tag_num); + } + + auto x_grad_mat = EigenMatrix::From(*emission_grad); + auto alpha_mat = EigenMatrix::From(alpha); + auto beta_mat = EigenMatrix::From(*beta); + + auto* place = ctx.eigen_device(); + auto prob = alpha_mat * beta_mat; + auto row_sum = prob.sum(Eigen::DSizes(1)) + .reshape(Eigen::DSizes(seq_length, 1)) + .broadcast(Eigen::DSizes(1, tag_num)); + x_grad_mat.device(*place) = + (prob / row_sum).unaryExpr(ScalarMul(ll_grad)); + + for (size_t k = 0; k < seq_length; ++k) { + x_grad_mat(k, label_value[k]) -= static_cast(ll_grad); + } + + if (transition_grad) { + T* trans_grad = transition_grad->data(); + for (size_t k = 0; k < tag_num; ++k) { + // Do not multiply by the output gradient here, because x_grad_mat has + // alrealy done this. + trans_grad[k] += x_grad_mat(/*from start state*/ 0, k); + trans_grad[tag_num + k] += + x_grad_mat(/*to end state*/ seq_length - 1, k); + } + + auto x_exps_mat = EigenMatrix::From(emission_exps); + + // TODO(caoying): Fix this to avoid using this local variable if we can + // profile the training process. + Tensor tmp; + tmp.mutable_data(beta->dims(), platform::CPUPlace()); + auto tmp_mat = EigenMatrix::From(tmp); + auto prob = beta_mat * x_exps_mat; + auto row_sum = prob.sum(Eigen::DSizes(1)) + .reshape(Eigen::DSizes(seq_length, 1)) + .broadcast(Eigen::DSizes(1, tag_num)); + tmp_mat.device(*place) = prob / row_sum; + + for (size_t k = 1; k < seq_length; ++k) { + T sum = 0.; + for (size_t i = 0; i < tag_num; ++i) { + for (size_t j = 0; j < tag_num; ++j) { + sum += w_exps[(i + state_trans_base_idx) * tag_num + j] * // (**) + alpha_mat(k - 1, i) * tmp_mat(k, j); + } + } + sum = 1. / sum; + for (size_t i = 0; i < tag_num; ++i) { + for (size_t j = 0; j < tag_num; ++j) { + trans_grad[(i + state_trans_base_idx) * tag_num + j] += + sum * w_exps[(i + state_trans_base_idx) * tag_num + j] * + alpha_mat(k - 1, i) * tmp_mat(k, j) * ll_grad; + } + } + trans_grad[(label_value[k - 1] + state_trans_base_idx) * tag_num + + label_value[k]] -= static_cast(ll_grad); + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a72708d9baad98b670b46da66f56bf79bad951be --- /dev/null +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -0,0 +1,207 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include + +#include + +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/proto_desc.h" +#include "paddle/fluid/operators/detail/grpc_server.h" +#include "paddle/fluid/operators/detail/sendrecvop_utils.h" +#include "paddle/fluid/operators/detail/simple_block_queue.h" +#include "paddle/string/printf.h" + +namespace paddle { +namespace operators { + +constexpr char kOptimizeBlock[] = "OptimizeBlock"; + +void RunServer(std::shared_ptr service) { + service->RunSyncUpdate(); + VLOG(4) << "RunServer thread end"; +} + +static void CreateTensorFromMessageType(framework::Variable *var, + sendrecv::VarType var_type) { + if (var_type == sendrecv::VarType::LOD_TENSOR) { + var->GetMutable(); + } else if (var_type == sendrecv::VarType::SELECTED_ROWS) { + var->GetMutable(); + } else { + PADDLE_THROW( + "VariableMessage type %d is not in " + "[LoDTensor, SelectedRows]", + var_type); + } +} + +class ListenAndServOp : public framework::OperatorBase { + public: + ListenAndServOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) { + if (!rpc_service_) { + std::string endpoint = Attr("endpoint"); + rpc_service_.reset(new detail::AsyncGRPCServer(endpoint)); + server_thread_.reset(new std::thread(RunServer, rpc_service_)); + } + } + + void Stop() override { + detail::MessageWithName term_msg; + term_msg.first = LISTEN_TERMINATE_MESSAGE; + rpc_service_->Push(term_msg); + rpc_service_->ShutDown(); + server_thread_->join(); + } + + std::string GetGradVarNameForTrainer(const std::string &varname) const { + if (grads_counter_.find(varname) == grads_counter_.end()) { + grads_counter_[varname] = 0; + } + return string::Sprintf("%s.trainer_%d", varname, grads_counter_[varname]++); + } + + void Run(const framework::Scope &scope, + const platform::Place &dev_place) const override { + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); + framework::Scope &recv_scope = scope.NewScope(); + + // FIXME(Yancey1989): initialize rpc server with lazy mode. + rpc_service_->SetScope(&recv_scope); + rpc_service_->SetDevCtx(&dev_ctx); + auto param_list = Attr>("ParamList"); + auto grad_list = Attr>("GradList"); + auto fan_in = Attr("Fanin"); + + auto *block = Attr(kOptimizeBlock); + auto *program = block->Program(); + framework::Executor executor(dev_place); + + // TODO(typhoonzero): change this to a while_op for every cluster-batch. + bool exit_flag = false; + while (!exit_flag) { + // Get from multiple trainers, we don't care about the order in which + // the gradients arrives, just add suffix 0~n and merge the gradient. 
+ rpc_service_->SetCond(0); + size_t recv_var_cnt = 0; + int batch_barrier = 0; + while (batch_barrier != fan_in) { + const detail::MessageWithName &v = rpc_service_->Get(); + auto grad_var_name = v.first; + if (grad_var_name == LISTEN_TERMINATE_MESSAGE) { + LOG(INFO) << "received terminate message and exit"; + exit_flag = true; + break; + } else if (grad_var_name == BATCH_BARRIER_MESSAGE) { + VLOG(3) << "recv batch barrier message"; + batch_barrier++; + continue; + } else { + // receive a variable + recv_var_cnt++; + auto it = + std::find(grad_list.begin(), grad_list.end(), grad_var_name); + std::string param_var_name; + if (it != grad_list.end()) { + param_var_name = param_list[it - grad_list.begin()]; + } else { + LOG(ERROR) << "grad has no paired param:" << grad_var_name; + } + VLOG(3) << "received grad: " << grad_var_name + << " updating param: " << param_var_name; + + if (fan_in > 1) { + grad_var_name = this->GetGradVarNameForTrainer(grad_var_name); + } + auto *var = recv_scope.FindVar(grad_var_name); + if (var == nullptr) { + LOG(ERROR) << "Can not find server side var: " << grad_var_name; + PADDLE_THROW("Can not find server side var"); + } + detail::DeserializeFromMessage(v.second, dev_ctx, var); + } + } + VLOG(3) << "recv " << recv_var_cnt << " parmeters for one barrier."; + // TODO(Yancey1989): merge SelectedRows variables here + if (exit_flag) { + rpc_service_->ShutDown(); + } + + try { + executor.Run(*program, &recv_scope, block->ID(), /*global_block*/ + false /*create_local_scope*/, false /*create_vars*/); + } catch (std::exception &e) { + LOG(ERROR) << "run sub program error " << e.what(); + } + rpc_service_->SetCond(1); + rpc_service_->WaitClientGet(recv_var_cnt); + grads_counter_.clear(); + } // while(true) + } + + protected: + std::shared_ptr rpc_service_; + std::shared_ptr server_thread_; + mutable std::unordered_map grads_counter_; +}; + +class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ListenAndServOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddComment(R"DOC( +ListenAndServ operator + +This operator will start a RPC server which can receive variables +from send_op and send back variables to recv_op. +)DOC"); + AddAttr("endpoint", + "(string, default 127.0.0.1:6164)" + "IP address to listen on.") + .SetDefault("127.0.0.1:6164") + .AddCustomChecker([](const std::string &ip) { return !ip.empty(); }); + AddAttr(kOptimizeBlock, + "BlockID to run on server side."); + AddAttr>( + "ParamList", "type list of string", + "grad->param name mapping to find which parameters to optimize.") + .SetDefault({}); + AddAttr>( + "GradList", "type list of string", + "grad->param name mapping to find which parameters to optimize.") + .SetDefault({}); + AddAttr("Fanin", "type int", + "Number of trainers in the current cluster job") + .SetDefault(1); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(listen_and_serv, ops::ListenAndServOp, + ops::ListenAndServOpMaker); diff --git a/paddle/fluid/operators/load_combine_op.cc b/paddle/fluid/operators/load_combine_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..1948063d886b79964b1a52d9d82a8e7d2fb0d493 --- /dev/null +++ b/paddle/fluid/operators/load_combine_op.cc @@ -0,0 +1,108 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. 
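When several trainers push gradients for the same parameter within one barrier, the server above keeps them apart by appending a per-variable counter suffix (GetGradVarNameForTrainer) and clears the counters after each barrier. A small sketch of that naming scheme, assuming the same "%s.trainer_%d" convention; the class and variable names here are illustrative, not Paddle APIs:

```cpp
#include <cstdio>
#include <string>
#include <unordered_map>

// Each received gradient variable gets a ".trainer_<k>" suffix, where k counts
// how many times that variable has arrived in the current barrier window.
class GradNameSuffixer {
 public:
  std::string Next(const std::string& varname) {
    int id = counters_[varname]++;  // value-initialized to 0 on first use
    return varname + ".trainer_" + std::to_string(id);
  }
  void Reset() { counters_.clear(); }  // analogous to grads_counter_.clear()

 private:
  std::unordered_map<std::string, int> counters_;
};

int main() {
  GradNameSuffixer namer;
  std::printf("%s\n", namer.Next("w@GRAD").c_str());  // w@GRAD.trainer_0
  std::printf("%s\n", namer.Next("w@GRAD").c_str());  // w@GRAD.trainer_1
  std::printf("%s\n", namer.Next("b@GRAD").c_str());  // b@GRAD.trainer_0
  return 0;
}
```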
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +class LoadCombineOp : public framework::OperatorBase { + public: + LoadCombineOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto filename = Attr("file_path"); + + std::ifstream fin(filename); + PADDLE_ENFORCE(static_cast(fin), + "Cannot open file %s for load_combine op", filename); + + auto out_var_names = Outputs("Out"); + PADDLE_ENFORCE_GT( + static_cast(out_var_names.size()), 0, + "The number of output variables should be greater than 0."); + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + for (size_t i = 0; i < out_var_names.size(); i++) { + auto *out_var = scope.FindVar(out_var_names[i]); + + PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found", + out_var_names[i]); + + auto *tensor = out_var->GetMutable(); + + // Error checking + PADDLE_ENFORCE(static_cast(fin), "Cannot read more from file %s", + filename); + + // Get data from fin to tensor + DeserializeFromStream(fin, tensor, dev_ctx); + + if (platform::is_gpu_place(place)) { + // copy CPU to GPU + framework::LoDTensor cpu_tensor; + cpu_tensor.ShareDataWith(*tensor); + cpu_tensor.set_lod(tensor->lod()); + + // reset tensor + out_var->Clear(); + tensor = out_var->GetMutable(); + tensor->set_lod(cpu_tensor.lod()); + Copy(cpu_tensor, place, dev_ctx, tensor); + } + } + } +}; + +class LoadCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + LoadCombineOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddOutput( + "Out", + "(vector) The output LoDTensors that will be read from the input file.") + .AsDuplicable(); + AddAttr("file_path", + "(string) " + "LoDTensors will be loaded from \"file_path\".") + .AddCustomChecker( + [](const std::string &path) { return !path.empty(); }); + AddComment(R"DOC( +LoadCombine Operator. + +LoadCombine operator loads LoDTensor variables from a file. The file should +contain one or more LoDTensors serialized using the SaveCombine operator. The +LoadCombine operator applies a deserialization strategy to appropriately load +the LodTensors, and this strategy complements the serialization strategy used +in the SaveCombine operator. Hence, the LoadCombine operator is tightly coupled +with the SaveCombine operator, and can only deserialize one or more LoDTensors +that were saved using the SaveCombine operator. 
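The coupling described above amounts to SaveCombine writing several tensors back to back into one stream and LoadCombine reading them back in the same order. A hedged sketch of that idea using a hypothetical length-prefixed layout (this is not Paddle's actual LoDTensor serialization format, which also stores dims, dtype, and LoD; it only illustrates sequential multi-tensor loading):

```cpp
#include <cstdint>
#include <iostream>
#include <sstream>
#include <vector>

// Hypothetical combined format: for each tensor, a uint64 element count
// followed by that many raw floats.
void SaveCombined(std::ostream& os, const std::vector<std::vector<float>>& ts) {
  for (const auto& t : ts) {
    uint64_t n = t.size();
    os.write(reinterpret_cast<const char*>(&n), sizeof(n));
    os.write(reinterpret_cast<const char*>(t.data()), n * sizeof(float));
  }
}

std::vector<std::vector<float>> LoadCombined(std::istream& is, size_t count) {
  std::vector<std::vector<float>> out(count);
  for (size_t i = 0; i < count; ++i) {  // must match the order used on save
    uint64_t n = 0;
    is.read(reinterpret_cast<char*>(&n), sizeof(n));
    out[i].resize(n);
    is.read(reinterpret_cast<char*>(out[i].data()), n * sizeof(float));
  }
  return out;
}

int main() {
  std::stringstream file;
  SaveCombined(file, {{1.f, 2.f}, {3.f, 4.f, 5.f}});
  auto tensors = LoadCombined(file, 2);
  return (tensors.size() == 2 && tensors[1].size() == 3) ? 0 : 1;
}
```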
+ +)DOC"); + } +}; +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; + +REGISTER_OPERATOR(load_combine, ops::LoadCombineOp, + ops::LoadCombineOpProtoMaker); diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c9bf5d72b234f96d9eb5a4c275737ac8c18cd63d --- /dev/null +++ b/paddle/fluid/operators/load_op.cc @@ -0,0 +1,83 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +class LoadOp : public framework::OperatorBase { + public: + LoadOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto filename = Attr("file_path"); + std::ifstream fin(filename); + PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s for load op", + filename); + + auto out_var_name = Output("Out"); + auto *out_var = scope.FindVar(out_var_name); + PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found", + out_var_name); + + auto *tensor = out_var->GetMutable(); + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + DeserializeFromStream(fin, tensor, dev_ctx); + + if (platform::is_gpu_place(place)) { + // copy CPU to GPU + framework::LoDTensor cpu_tensor; + cpu_tensor.ShareDataWith(*tensor); + cpu_tensor.set_lod(tensor->lod()); + + // reset tensor + out_var->Clear(); + tensor = out_var->GetMutable(); + tensor->set_lod(cpu_tensor.lod()); + Copy(cpu_tensor, place, dev_ctx, tensor); + } + } +}; + +class LoadOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + LoadOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddOutput("Out", "(Tensor) The tensor need to be loaded"); + AddAttr("file_path", + "(string) " + "Variable will be loaded from \"file_path\".") + .AddCustomChecker( + [](const std::string &path) { return !path.empty(); }); + AddComment(R"DOC( +Load Operator. + +Load operator will load a tensor variable from disk file. + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; + +REGISTER_OPERATOR(load, ops::LoadOp, ops::LoadOpProtoMaker); diff --git a/paddle/fluid/operators/lod_array_length_op.cc b/paddle/fluid/operators/lod_array_length_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f11f5a89f5ad5b2f3deed905625aefa1e9d9935b --- /dev/null +++ b/paddle/fluid/operators/lod_array_length_op.cc @@ -0,0 +1,74 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class LoDArrayLengthOp : public framework::OperatorBase { + public: + LoDArrayLengthOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto &x = scope.FindVar(Input("X"))->Get(); + auto &out = + *scope.FindVar(Output("Out"))->GetMutable(); + out.Resize({1}); + auto cpu = platform::CPUPlace(); + *out.mutable_data(cpu) = static_cast(x.size()); + } +}; + +class LoDArrayLengthProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + LoDArrayLengthProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(LoDTensorArray) The input tensor array."); + AddOutput("Out", "(Tensor) 1x1 CPU Tensor of length, int64_t"); + AddComment(R"DOC( +LoDArrayLength Operator. + +This operator obtains the length of lod tensor array: + +$$Out = len(X)$$ + +NOTE: The output is a CPU Tensor since the control variable should be only in +CPU and the length of LoDTensorArray should be used as control variables. + +)DOC"); + } +}; + +class LoDArrayLengthInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInput("X")); + PADDLE_ENFORCE(context->HasOutput("Out")); + context->SetOutputDim("Out", {1}); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(lod_array_length, ops::LoDArrayLengthOp, + ops::LoDArrayLengthInferShape, ops::LoDArrayLengthProtoMaker, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/lod_rank_table_op.cc b/paddle/fluid/operators/lod_rank_table_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0b9426a9f8f0b0b3082667dc7a1414aceb824aca --- /dev/null +++ b/paddle/fluid/operators/lod_rank_table_op.cc @@ -0,0 +1,82 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/op_registry.h" +namespace paddle { +namespace operators { + +class LoDRankTableOp : public framework::OperatorBase { + public: + LoDRankTableOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &dev_place) const override { + auto x = scope.FindVar(Input("X"))->Get(); + auto *out = + scope.FindVar(Output("Out"))->GetMutable(); + VLOG(10) << "Level = " << static_cast(Attr("level")); + out->Reset(x.lod(), static_cast(Attr("level"))); + VLOG(10) << Input("X") << "'s lod information is " << *out; + } +}; + +class LoDRankTableOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + LoDRankTableOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(LoDTensor) input lod tensor, must contain lod information."); + AddOutput("Out", "(LoDRankTable) The rank table of specific level."); + AddAttr("level", "(int) the specific lod level to rank.") + .SetDefault(0) + .EqualGreaterThan(0); + AddComment(R"DOC(Create LoDRanTable by LoDTensor + +LoD Rank Table stores the `level` of `lod` which is ordered by sequence +length in descending order. It is useful when implement dynamic RNN and is +shared by dynamic RNN memory, dynamic RNN slice input and dynamic RNN slice +output operators. +)DOC"); + } +}; + +class LoDRankTableInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInput("X"), "LoDRankTable must has input X"); + } +}; + +class LoDRankTableInferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + for (auto &o : op_desc.Output("Out")) { + block->FindRecursiveOrCreateVar(o).SetType( + framework::proto::VarDesc::LOD_RANK_TABLE); + } + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(lod_rank_table, paddle::operators::LoDRankTableOp, + paddle::operators::LoDRankTableOpProtoMaker, + paddle::operators::LoDRankTableInferShape, + paddle::operators::LoDRankTableInferVarType, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/lod_reset_op.cc b/paddle/fluid/operators/lod_reset_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..55ae71c1815470925b2bb153fc647b331dcc9ba4 --- /dev/null +++ b/paddle/fluid/operators/lod_reset_op.cc @@ -0,0 +1,119 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/lod_reset_op.h" + +namespace paddle { +namespace operators { + +class LoDResetOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + // input check + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of LoDResetOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of LoDResetOp should not be null."); + // If target LoD is not set form Input(), then it must be set from Attr(). + if (!ctx->HasInput("TargetLoD")) { + auto level0 = ctx->Attrs().Get>("target_lod"); + PADDLE_ENFORCE(level0.size() > 1, + "Target LoD is not found, should be set to be a valid one " + "through Input() or Attr()."); + } + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +class LoDResetOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LoDResetOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(LoDTensor) The input tensor of lod_reset operator."); + AddInput("TargetLoD", + "(Tensor, optional) The target level 0 LoD from Input().") + .AsDispensable(); + AddOutput("Out", "(LoDTensor) The output tensor of lod_reset operator."); + AddAttr>("target_lod", + "The target level 0 LoD from Attr().") + .SetDefault(std::vector{}); + AddComment(R"DOC(LoDReset operator + +Reset LoD of Input(X) into a new one specified by Input(TargetLoD) or +Attr(target_lod), or set LoD for Input(X) if it doesn't have one. +Currently the lod_reset operator only supports the reset of level 0 LoD. +At least one of Input(TargetLoD) and Attr(target_lod) must be set, +and if both of them are set, Input(TargetLoD) will be chosen as the +target LoD. + +An example: +Given a float LoDTensor X with shape (6, 1), its transpose form represents + + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], + +with LoD = [[0, 2, 5, 6]] and the three (transposed) sequences look like + + [1.0, 2.0], [3.0, 4.0, 5.0], [6.0]. + +If target LoD = [0, 4, 6], the lod_reset operator will reset the LoD and +the sequences that the LoDTensor Output(Out) contains becomes: + + [1.0, 2.0, 3.0, 4.0], [5.0, 6.0]. 
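
The reset itself touches only metadata: Output(Out) shares the data of Input(X) and merely installs the new level-0 LoD. The snippet below is an illustrative, standalone sketch of that level-0 slicing convention (SplitByLoD is a hypothetical helper, not part of the operator); it reproduces the example above.

#include <cstddef>
#include <iostream>
#include <vector>

// Split a flat batch into sequences according to a level-0 LoD:
// sequence i covers rows [lod[i], lod[i+1]) of the batch.
std::vector<std::vector<float>> SplitByLoD(const std::vector<float>& data,
                                           const std::vector<std::size_t>& lod) {
  std::vector<std::vector<float>> seqs;
  for (std::size_t i = 0; i + 1 < lod.size(); ++i) {
    seqs.emplace_back(data.begin() + lod[i], data.begin() + lod[i + 1]);
  }
  return seqs;
}

int main() {
  const std::vector<float> x = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
  // Target LoD {0, 4, 6} yields the two sequences [1, 2, 3, 4] and [5, 6].
  for (const auto& seq : SplitByLoD(x, {0, 4, 6})) {
    for (float v : seq) std::cout << v << ' ';
    std::cout << '\n';
  }
}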
+ +)DOC"); + } +}; + +class LoDResetGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) shouldn't be null."); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(lod_reset, ops::LoDResetOp, ops::LoDResetOpMaker, lod_reset_grad, + ops::LoDResetGradOp); +REGISTER_OP_CPU_KERNEL(lod_reset, + ops::LoDResetKernel, + ops::LoDResetKernel); +REGISTER_OP_CPU_KERNEL( + lod_reset_grad, ops::LoDResetGradKernel, + ops::LoDResetGradKernel); diff --git a/paddle/fluid/operators/lod_reset_op.cu b/paddle/fluid/operators/lod_reset_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..8bfc8bd3bf06037d7fcd387dee0514a1e4c6a0f9 --- /dev/null +++ b/paddle/fluid/operators/lod_reset_op.cu @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/lod_reset_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + lod_reset, ops::LoDResetKernel, + ops::LoDResetKernel); +REGISTER_OP_CUDA_KERNEL( + lod_reset_grad, + ops::LoDResetGradKernel, + ops::LoDResetGradKernel); diff --git a/paddle/fluid/operators/lod_reset_op.h b/paddle/fluid/operators/lod_reset_op.h new file mode 100644 index 0000000000000000000000000000000000000000..a10efee0bdd8c58d23c05bb85f0f882d801848fe --- /dev/null +++ b/paddle/fluid/operators/lod_reset_op.h @@ -0,0 +1,79 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class LoDResetKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* out = ctx.Output("Out"); + auto* in = ctx.Input("X"); + auto* lod_t = ctx.Input("TargetLoD"); + + std::vector level0; + if (lod_t) { + auto* lod = lod_t->data(); + if (platform::is_gpu_place(ctx.GetPlace())) { + framework::Tensor lod_cpu; + framework::Copy(*lod_t, platform::CPUPlace(), ctx.device_context(), + &lod_cpu); + lod = lod_cpu.data(); + } + level0 = std::vector(lod, lod + lod_t->numel()); + } else { + level0 = ctx.Attr>("target_lod"); + } + + PADDLE_ENFORCE(level0.size() > 1UL, + "The size of target LoD should be greater than 1."); + PADDLE_ENFORCE(level0[0] == 0, + "Target LoD should be a vector starting from 0."); + PADDLE_ENFORCE(level0.back() == in->dims()[0], + "Target LoD should be a vector end with the " + "first dimension of Input(X)."); + for (size_t i = 0; i < level0.size() - 1; ++i) { + PADDLE_ENFORCE(level0[i + 1] > level0[i], + "Target LoD should be an ascending vector."); + } + + out->ShareDataWith(*in); + // cast level0 to size_t + std::vector ulevel0(level0.size(), 0); + std::transform(level0.begin(), level0.end(), ulevel0.begin(), + [](int a) { return static_cast(a); }); + framework::LoD target_lod; + target_lod.push_back(ulevel0); + out->set_lod(target_lod); + } +}; + +template +class LoDResetGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* d_out = ctx.Input(framework::GradVarName("Out")); + auto* d_x = ctx.Output(framework::GradVarName("X")); + + d_x->ShareDataWith(*d_out); + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/lod_tensor_to_array_op.cc b/paddle/fluid/operators/lod_tensor_to_array_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..edc32bcec1441e50e24612789727db9a044cde54 --- /dev/null +++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc @@ -0,0 +1,168 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detail/safe_ref.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +struct CopyRange { + size_t begin; + size_t end; +}; + +class LoDTensorToArrayOp : public framework::OperatorBase { + public: + LoDTensorToArrayOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto &x = detail::Ref(scope.FindVar(Input("X")), "Cannot find input %s", + Input("X")) + .Get(); + auto &rank_table = detail::Ref(scope.FindVar(Input("RankTable"))) + .Get(); + auto &out = *detail::Ref(scope.FindVar(Output("Out"))) + .GetMutable(); + auto &items = rank_table.items(); + auto max_seq_len = items[0].length; + auto rank_level = rank_table.level(); + + PADDLE_ENFORCE_LT(rank_level, x.lod().size(), + "Input should be a LOD tensor, and size is at least %d", + rank_level + 1); + out.resize(max_seq_len); + std::vector> copy_ranges(max_seq_len); + + // set out[i] lod + for (size_t t = 0; t < max_seq_len; t++) { + auto &lod = *out[t].mutable_lod(); + lod.clear(); + for (auto &item : items) { + if (t >= item.length) { + break; + } + size_t start_idx = x.lod()[rank_level][item.index] + t; + auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset( + x.lod(), start_idx, start_idx + 1, rank_level + 1); + auto &lod_length = lod_and_offset.first; + framework::AppendLoD(&lod, lod_length); + size_t start_offset = lod_and_offset.second.first; + size_t end_offset = lod_and_offset.second.second; + copy_ranges[t].emplace_back(CopyRange{start_offset, end_offset}); + } + } + for (size_t i = 0; i < max_seq_len; ++i) { + auto &ranges = copy_ranges[i]; + size_t height = std::accumulate( + ranges.begin(), ranges.end(), 0UL, + [](size_t a, const CopyRange &b) { return a + b.end - b.begin; }); + auto x_dim = x.dims(); + x_dim[0] = static_cast(height); + out[i].Resize(x_dim); + out[i].mutable_data(x.place(), x.type()); + size_t offset = 0; + for (auto &each_range : ranges) { + size_t len = each_range.end - each_range.begin; + if (len == 0) { + continue; + } + // out[i][offset: offset+len] = x[each_range.begin: each_range.end] + auto slice = out[i].Slice(static_cast(offset), + static_cast(offset + len)); + + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + framework::Copy(x.Slice(static_cast(each_range.begin), + static_cast(each_range.end)), + x.place(), dev_ctx, &slice); + offset += len; + } + } + } +}; + +class LoDTensorToArrayOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + LoDTensorToArrayOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", ""); + AddInput("RankTable", ""); + AddOutput("Out", ""); + AddComment(""); + } +}; + +class LoDTensorToArrayInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInput("X"), + "Input(X) of LoDTensorToArrayOp should not be null."); + PADDLE_ENFORCE( + context->HasInput("RankTable"), + "Input(RankTable) of LoDTensorToArrayOp should not be null."); + + 
PADDLE_ENFORCE(context->HasOutput("Out"), + "Output(Out) of LoDTensorToArrayOp should not be null."); + + auto x_dim = context->GetInputDim("X"); + // The first dim of each LoDTensor in Output can only be set at run-time.; + // We still have to Resize each LoDTensor in Output. + context->SetOutputDim("Out", x_dim); + } +}; + +class LoDTensorToArrayInferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + for (auto &out_var : op_desc.Output("Out")) { + block->Var(out_var)->SetType(framework::proto::VarDesc::LOD_TENSOR_ARRAY); + } + } +}; + +class LoDTensorToArrayGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("array_to_lod_tensor"); + grad_op->SetInput("X", OutputGrad("Out")); + grad_op->SetInput("RankTable", Input("RankTable")); + grad_op->SetOutput("Out", InputGrad("X")); + grad_op->SetAttrMap(Attrs()); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(lod_tensor_to_array, ops::LoDTensorToArrayOp, + ops::LoDTensorToArrayOpProtoMaker, + ops::LoDTensorToArrayInferShape, + ops::LoDTensorToArrayInferVarType, + ops::LoDTensorToArrayGradMaker); diff --git a/paddle/fluid/operators/log_loss_op.cc b/paddle/fluid/operators/log_loss_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..6c5cd2956811329a1ac5da9e42e808c2684dc771 --- /dev/null +++ b/paddle/fluid/operators/log_loss_op.cc @@ -0,0 +1,115 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/log_loss_op.h" + +namespace paddle { +namespace operators { + +class LogLossOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Predicted"), + "Input(Predicted) must be initialized."); + PADDLE_ENFORCE(ctx->HasInput("Labels"), + "Input(Labels) must be initialized."); + + auto pred_dims = ctx->GetInputDim("Predicted"); + auto label_dims = ctx->GetInputDim("Labels"); + + PADDLE_ENFORCE_EQ(pred_dims, label_dims); + PADDLE_ENFORCE_EQ(pred_dims.size(), 2, + "The rank of Input(Predicted) must be 2 and the shape is " + "[batch_size, 1]."); + PADDLE_ENFORCE_EQ(pred_dims[1], 1, + "Each row of Input(Predicted) contains a real value, " + "so the 2nd dimension of Input(X) must be 1."); + + ctx->SetOutputDim("Loss", {pred_dims[0], 1}); + ctx->ShareLoD("Predicted", "Loss"); + } +}; + +template +class LogLossOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LogLossOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Predicted", + "The input value (Predicted) of Log loss op." + "Predicted is a 2-D tensor with shape [batch_size, 1]."); + AddInput("Labels", + "The target value (Labels) of Log loss op." + "Labels is a 2-D tensor with shape [batch_size, 1]."); + AddOutput("Loss", + "The output tensor with shape [batch_size, 1] " + "which represents the log loss."); + AddAttr("epsilon", "Epsilon in log loss."); + AddComment(R"DOC( +LogLoss Operator. + +Log loss is a loss function used for binary classification. Log Loss quantifies +the accuracy of a classifier by penalising false classifications. Minimising the +Log Loss is equivalent to maximising the accuracy of the classifier. We define +Predicted as the values predicted by our model and Labels as the target ground +truth value. Log loss can evaluate how close the predicted values are to the +target. The shapes of Predicted and Labels are both [batch_size, 1]. 
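
As a quick numerical illustration (LogLoss below is a hypothetical standalone helper, not the operator's kernel, and the epsilon value is chosen arbitrarily), a confident correct prediction yields a small loss while a confident wrong one yields a large loss:

#include <cmath>
#include <cstdio>

// Per-sample log loss with the epsilon stabiliser described above.
double LogLoss(double predicted, double label, double epsilon) {
  return -label * std::log(predicted + epsilon) -
         (1.0 - label) * std::log(1.0 - predicted + epsilon);
}

int main() {
  std::printf("%.4f\n", LogLoss(0.9, 1.0, 1e-7));  // ~0.1054 (confident, correct)
  std::printf("%.4f\n", LogLoss(0.9, 0.0, 1e-7));  // ~2.3026 (confident, wrong)
}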
+The equation is: + +$$ +Loss = - Labels * log(Predicted + \epsilon) - + (1 - Labels) * log(1 - Predicted + \epsilon) +$$ + +)DOC"); + } +}; + +class LogLossGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Predicted"), + "Input(Predicted) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Labels"), + "Input(Labels) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")), + "Input(Loss@GRAD) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Predicted")), + "Output(Predicted@GRAD) should not be null."); + + auto pred_dims = ctx->GetInputDim("Predicted"); + auto label_dims = ctx->GetInputDim("Labels"); + auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss")); + PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims); + + auto pred_grad_name = framework::GradVarName("Predicted"); + ctx->SetOutputDim(pred_grad_name, pred_dims); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(log_loss, ops::LogLossOp, ops::LogLossOpMaker, log_loss_grad, + ops::LogLossGradOp); +REGISTER_OP_CPU_KERNEL( + log_loss, ops::LogLossKernel); +REGISTER_OP_CPU_KERNEL( + log_loss_grad, + ops::LogLossGradKernel); diff --git a/paddle/fluid/operators/log_loss_op.cu b/paddle/fluid/operators/log_loss_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..c164a6d04056c2e9d9302b609c1ff4f2b2c4a3f3 --- /dev/null +++ b/paddle/fluid/operators/log_loss_op.cu @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/log_loss_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + log_loss, ops::LogLossKernel); +REGISTER_OP_CUDA_KERNEL( + log_loss_grad, + ops::LogLossGradKernel); diff --git a/paddle/fluid/operators/log_loss_op.h b/paddle/fluid/operators/log_loss_op.h new file mode 100644 index 0000000000000000000000000000000000000000..67fac7cfe55d1d50063afac925863a8fb2eb63a8 --- /dev/null +++ b/paddle/fluid/operators/log_loss_op.h @@ -0,0 +1,75 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; + +template +class LogLossKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* loss_out = ctx.Output("Loss"); + + loss_out->mutable_data(ctx.GetPlace()); + + auto epsilon = static_cast(ctx.Attr("epsilon")); + + auto prediction = EigenVector::Flatten(*ctx.Input("Predicted")); + auto label = EigenVector::Flatten(*ctx.Input("Labels")); + + auto loss = EigenVector::Flatten(*loss_out); + auto& place = *ctx.template device_context().eigen_device(); + + loss.device(place) = (-(label * (prediction + epsilon).log()) - + ((static_cast(1) - label) * + (static_cast(1) - prediction + epsilon).log())); + } +}; + +template +class LogLossGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto epsilon = static_cast(ctx.Attr("epsilon")); + + auto prediction = EigenVector::Flatten(*ctx.Input("Predicted")); + auto label = EigenVector::Flatten(*ctx.Input("Labels")); + + auto* dloss = ctx.Input(framework::GradVarName("Loss")); + auto* dpred = ctx.Output(framework::GradVarName("Predicted")); + + auto dl = EigenVector::Flatten(*dloss); + auto& place = *ctx.template device_context().eigen_device(); + + if (dpred) { + dpred->mutable_data(ctx.GetPlace()); + auto dx = framework::EigenVector::Flatten(*dpred); + dx.device(place) = dl * (-(label / (prediction + epsilon)) + + ((static_cast(1) - label) / + (static_cast(1) - prediction + epsilon))); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/logical_op.cc b/paddle/fluid/operators/logical_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..ff49895df1979bde9dc9f9c7b92601a2a65241da --- /dev/null +++ b/paddle/fluid/operators/logical_op.cc @@ -0,0 +1,152 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/logical_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { +template +class BinaryLogicalOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + BinaryLogicalOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + OpComment comment; + AddInput("X", + string::Sprintf("(LoDTensor) Left hand operand of %s operator", + comment.type)); + AddInput("Y", + string::Sprintf("(LoDTensor) Right hand operand of %s operator", + comment.type)); + AddOutput("Out", string::Sprintf( + "(LoDTensor) n-dim bool tensor. Each element is %s", + comment.equation)); + AddComment(string::Sprintf(R"DOC(%s Operator + +It operates element-wise on X and Y, and returns the Out. X, Y and Out are N-dim boolean tensors. 
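
(For instance, once instantiated as logical_and, the kernel is nothing more than an element-wise transform over the flattened tensors; the standalone sketch below is illustrative only and is not the operator code.)

#include <algorithm>
#include <array>
#include <iostream>

int main() {
  // Out[i] = X[i] && Y[i] for every aligned pair of elements.
  std::array<bool, 4> x = {true, true, false, false};
  std::array<bool, 4> y = {true, false, true, false};
  std::array<bool, 4> out{};
  std::transform(x.begin(), x.end(), y.begin(), out.begin(),
                 [](bool a, bool b) { return a && b; });
  for (bool v : out) std::cout << v << ' ';  // prints: 1 0 0 0
  std::cout << '\n';
}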
+Each element of Out is calculated by %s +)DOC", + comment.type, comment.equation)); + } +}; + +template +class UnaryLogicalOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + UnaryLogicalOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + OpComment comment; + AddInput("X", string::Sprintf("(LoDTensor) Operand of %s operator", + comment.type)); + AddOutput("Out", string::Sprintf( + "(LoDTensor) n-dim bool tensor. Each element is %s", + comment.equation)); + AddComment(string::Sprintf(R"DOC(%s Operator + +It operates element-wise on X, and returns the Out. X and Out are N-dim boolean tensors. +Each element of Out is calculated by %s +)DOC", + comment.type, comment.equation)); + } +}; + +template +class BinaryLogicalOpInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + OpComment comment; + PADDLE_ENFORCE(context->HasInput("X"), + "Input(X) of %s operator must not be null", comment.type); + PADDLE_ENFORCE(context->HasInput("Y"), + "Input(Y) of %s operator must not be null", comment.type); + auto dim_x = context->GetInputDim("X"); + auto dim_y = context->GetInputDim("Y"); + PADDLE_ENFORCE_EQ(framework::product(dim_x), framework::product(dim_y), + "The number of elements in X and Y should be same"); + + context->SetOutputDim("Out", context->GetInputDim("X")); + context->ShareLoD("X", "Out"); + } +}; + +template +class UnaryLogicalOpInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + OpComment comment; + PADDLE_ENFORCE(context->HasInput("X"), + "Input(X) of %s operator must not be null", comment.type); + auto dim_x = context->GetInputDim("X"); + + context->SetOutputDim("Out", context->GetInputDim("X")); + context->ShareLoD("X", "Out"); + } +}; + +class LogicalOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx); + // LogicalOp kernel's device type is decided by input tensor place + kt.place_ = ctx.Input("X")->place(); + return kt; + } +}; + +} // namespace operators +} // namespace paddle + +#define REGISTER_BINARY_LOGICAL_OP(op_type, _equation) \ + struct _##op_type##Comment { \ + static char type[]; \ + static char equation[]; \ + }; \ + char _##op_type##Comment::type[]{#op_type}; \ + char _##op_type##Comment::equation[]{_equation}; \ + REGISTER_OPERATOR( \ + op_type, ::paddle::operators::LogicalOp, \ + ::paddle::operators::BinaryLogicalOpProtoMaker<_##op_type##Comment>, \ + ::paddle::operators::BinaryLogicalOpInferShape<_##op_type##Comment>, \ + ::paddle::framework::EmptyGradOpMaker); + +#define REGISTER_UNARY_LOGICAL_OP(op_type, _equation) \ + struct _##op_type##Comment { \ + static char type[]; \ + static char equation[]; \ + }; \ + char _##op_type##Comment::type[]{#op_type}; \ + char _##op_type##Comment::equation[]{_equation}; \ + REGISTER_OPERATOR( \ + op_type, ::paddle::operators::LogicalOp, \ + ::paddle::operators::UnaryLogicalOpProtoMaker<_##op_type##Comment>, \ + ::paddle::operators::UnaryLogicalOpInferShape<_##op_type##Comment>, \ + ::paddle::framework::EmptyGradOpMaker); + +REGISTER_BINARY_LOGICAL_OP(logical_and, "$$Out = X \\&\\& Y$$"); +REGISTER_BINARY_LOGICAL_KERNEL(logical_and, CPU, + 
paddle::operators::LogicalAndFunctor); +REGISTER_BINARY_LOGICAL_OP(logical_or, "$$Out = X || Y$$"); +REGISTER_BINARY_LOGICAL_KERNEL(logical_or, CPU, + paddle::operators::LogicalOrFunctor); +REGISTER_UNARY_LOGICAL_OP(logical_not, "$$Out = !X$$"); +REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CPU, + paddle::operators::LogicalNotFunctor); +REGISTER_BINARY_LOGICAL_OP(logical_xor, + "$$Out = (X || Y) \\, \\&\\& \\, !(X \\&\\& Y)$$"); +REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CPU, + paddle::operators::LogicalXorFunctor); diff --git a/paddle/fluid/operators/logical_op.cu b/paddle/fluid/operators/logical_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..2b17444061252b714cd9bbaadc7fcf877628ef89 --- /dev/null +++ b/paddle/fluid/operators/logical_op.cu @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/logical_op.h" + +REGISTER_BINARY_LOGICAL_KERNEL(logical_and, CUDA, + paddle::operators::LogicalAndFunctor); +REGISTER_BINARY_LOGICAL_KERNEL(logical_or, CUDA, + paddle::operators::LogicalOrFunctor); +REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CUDA, + paddle::operators::LogicalNotFunctor); +REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CUDA, + paddle::operators::LogicalXorFunctor); diff --git a/paddle/fluid/operators/logical_op.h b/paddle/fluid/operators/logical_op.h new file mode 100644 index 0000000000000000000000000000000000000000..f6d5866c2c8e4ce54e9556a1acf69414dba523d2 --- /dev/null +++ b/paddle/fluid/operators/logical_op.h @@ -0,0 +1,94 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/transform.h" + +namespace paddle { +namespace operators { + +template +struct LogicalAndFunctor { + using ELEM_TYPE = T; + HOSTDEVICE bool operator()(const T& a, const T& b) const { return a && b; } +}; + +template +struct LogicalOrFunctor { + using ELEM_TYPE = T; + HOSTDEVICE bool operator()(const T& a, const T& b) const { return a || b; } +}; + +template +struct LogicalNotFunctor { + using ELEM_TYPE = T; + HOSTDEVICE bool operator()(const T& a) const { return !a; } +}; + +template +struct LogicalXorFunctor { + using ELEM_TYPE = T; + HOSTDEVICE bool operator()(const T& a, const T& b) const { + return (a || b) && !(a && b); + } +}; + +template +class BinaryLogicalOpKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + using T = typename Functor::ELEM_TYPE; + auto* x = context.Input("X"); + auto* y = context.Input("Y"); + auto* out = context.Output("Out"); + Functor binary_func; + platform::Transform trans; + trans(context.template device_context(), x->data(), + x->data() + x->numel(), y->data(), + out->mutable_data(context.GetPlace()), binary_func); + } +}; + +template +class UnaryLogicalOpKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + using T = typename Functor::ELEM_TYPE; + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + Functor unary_func; + platform::Transform trans; + trans(context.template device_context(), x->data(), + x->data() + x->numel(), + out->mutable_data(context.GetPlace()), unary_func); + } +}; + +} // namespace operators +} // namespace paddle + +#define REGISTER_BINARY_LOGICAL_KERNEL(op_type, dev, functor) \ + REGISTER_OP_##dev##_KERNEL( \ + op_type, ::paddle::operators::BinaryLogicalOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>); + +#define REGISTER_UNARY_LOGICAL_KERNEL(op_type, dev, functor) \ + REGISTER_OP_##dev##_KERNEL( \ + op_type, ::paddle::operators::UnaryLogicalOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>); diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..2c555f1a3fa228215812b7d3291e882b0d42bb64 --- /dev/null +++ b/paddle/fluid/operators/lookup_table_op.cc @@ -0,0 +1,147 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/lookup_table_op.h" +#include "paddle/fluid/framework/var_type_inference.h" + +namespace paddle { +namespace operators { + +class LookupTableOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("W"), + "Input(W) of LookupTableOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Ids"), + "Input(Ids) of LookupTableOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of LookupTableOp should not be null."); + + auto table_dims = ctx->GetInputDim("W"); + auto ids_dims = ctx->GetInputDim("Ids"); + + PADDLE_ENFORCE_EQ(ids_dims.size(), 2); + PADDLE_ENFORCE_EQ(ids_dims[1], 1); + + ctx->SetOutputDim("Out", {ids_dims[0], table_dims[1]}); + ctx->ShareLoD("Ids", /*->*/ "Out"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("W")->type()), + ctx.device_context()); + } +}; + +class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LookupTableOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("W", + "An input represents embedding tensors, " + "which is a learnable parameter."); + AddInput("Ids", + "An input with type int32 or int64 " + "contains the ids to be looked up in W. " + "Ids must be a column vector with rank = 2. " + "The 2nd dimension size must be 1."); + AddOutput("Out", "The lookup results, which have the same type as W."); + AddAttr("is_sparse", + "(boolean, default false) " + "Sparse update") + .SetDefault(false); + AddAttr("padding_idx", + "(int64, default -1) " + "If the value is -1, it makes no effect to lookup. " + "Otherwise the given value indicates padding the output " + "with zeros whenever lookup encounters it in Ids.") + .SetDefault(-1); + AddComment(R"DOC( +Lookup Table Operator. + +This operator is used to perform lookups on the parameter W, +then concatenated into a dense tensor. + +The input Ids can carry the LoD (Level of Details) information, +or not. And the output only shares the LoD information with input Ids. 
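
A minimal sketch of the dense lookup path may help (Lookup below is a hypothetical helper, not the kernel; the actual CPU kernel additionally enforces that every id lies inside the table): each output row is a copy of the table row selected by the corresponding id, and rows whose id equals padding_idx are left as zeros.

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

// Out[i, :] = W[Ids[i], :]; the row stays zero when Ids[i] == padding_idx.
std::vector<float> Lookup(const std::vector<float>& w, std::int64_t height,
                          std::int64_t width,
                          const std::vector<std::int64_t>& ids,
                          std::int64_t padding_idx) {
  std::vector<float> out(ids.size() * width, 0.0f);
  for (std::size_t i = 0; i < ids.size(); ++i) {
    if (ids[i] == padding_idx) continue;  // keep the zero row
    assert(ids[i] >= 0 && ids[i] < height);
    std::memcpy(out.data() + i * width, w.data() + ids[i] * width,
                width * sizeof(float));
  }
  return out;
}

For example, with W of shape [10, 3], Ids = {4, 0, 4} and padding_idx = 0, the result has shape [3, 3] and its second row is all zeros.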
+ +)DOC"); + } +}; + +class LookupTableOpGradDescMaker + : public framework::DefaultGradOpDescMaker { + using ::paddle::framework::DefaultGradOpDescMaker< + true>::DefaultGradOpDescMaker; + + protected: + virtual std::string GradOpType() const { return "lookup_table_grad"; } +}; + +class LookupTableOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + auto table_dims = ctx->GetInputDim("W"); + ctx->SetOutputDim(framework::GradVarName("W"), table_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("W")->type()), + ctx.device_context()); + } +}; + +class LookupTableOpGradVarTypeInference : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + auto out_var_name = op_desc.Output(framework::GradVarName("W")).front(); + auto attr = op_desc.GetAttr("is_sparse"); + bool is_sparse = boost::get(attr); + if (is_sparse) { + VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W") + << " is set to SelectedRows"; + block->Var(out_var_name) + ->SetType(framework::proto::VarDesc::SELECTED_ROWS); + } else { + VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W") + << " is set to LoDTensor"; + block->Var(out_var_name)->SetType(framework::proto::VarDesc::LOD_TENSOR); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(lookup_table, ops::LookupTableOp, + ops::LookupTableOpGradDescMaker, ops::LookupTableOpMaker); +REGISTER_OPERATOR(lookup_table_grad, ops::LookupTableOpGrad, + ops::LookupTableOpGradVarTypeInference); + +REGISTER_OP_CPU_KERNEL(lookup_table, ops::LookupTableKernel, + ops::LookupTableKernel); +REGISTER_OP_CPU_KERNEL(lookup_table_grad, ops::LookupTableGradKernel, + ops::LookupTableGradKernel); diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..801adba5a440ebda9506717603e8f9665ea9e6ba --- /dev/null +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -0,0 +1,176 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/lookup_table_op.h" +#include "paddle/fluid/platform/assert.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { + +template +__global__ void LookupTable(T* output, const T* table, const int64_t* ids, + const int64_t N, const int64_t K, const int64_t D, + const int64_t padding_idx) { + int idx = threadIdx.x; + int idy = blockIdx.x + threadIdx.y * GridDimX; + + while (idy < K) { + int64_t id = ids[idy]; + PADDLE_ASSERT(id >= 0); + PADDLE_ASSERT(id < N); + T* out = output + idy * D; + const T* tab = table + id * D; + for (int i = idx; i < D; i += BlockDimX) { + if (PaddingFlag) { + if (id == padding_idx) + out[i] = static_cast(0); + else + out[i] = tab[i]; + } else { + out[i] = tab[i]; + } + } + idy += BlockDimY * GridDimX; + } +} + +template +__global__ void LookupTableGrad(T* table, const T* output, const int64_t* ids, + const int64_t N, const int64_t K, + const int64_t D) { + int idx = threadIdx.x; + int idy = blockIdx.x + threadIdx.y * GridDimX; + + while (idy < K) { + int id = ids[idy]; + PADDLE_ASSERT(id >= 0); + PADDLE_ASSERT(id < N); + const T* out = output + idy * D; + T* tab = table + id * D; + for (int i = idx; i < D; i += BlockDimX) { + paddle::platform::CudaAtomicAdd(&tab[i], out[i]); + } + idy += BlockDimY * GridDimX; + } +} + +template +class LookupTableCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* table_t = context.Input("W"); + auto* ids_t = context.Input("Ids"); + auto* output_t = context.Output("Out"); + int64_t padding_idx = context.Attr("padding_idx"); + + size_t N = table_t->dims()[0]; + size_t D = table_t->dims()[1]; + size_t K = ids_t->numel(); + auto* ids = ids_t->data(); + auto* table = table_t->data(); + auto* output = output_t->mutable_data(context.GetPlace()); + + dim3 threads(128, 8); + dim3 grids(8, 1); + + if (padding_idx == -1) + LookupTable< + T, 128, 8, 8, + false><<>>( + output, table, ids, N, K, D, padding_idx); + else + LookupTable< + T, 128, 8, 8, + true><<>>( + output, table, ids, N, K, D, padding_idx); + } +}; + +template +class LookupTableGradCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto& dev_ctx = + context.template device_context(); + bool is_sparse = context.Attr("is_sparse"); + // Since paddings are not trainable and fixed in forward, the gradient of + // paddings makes no sense and we don't deal with it in backward. + if (is_sparse) { + auto* ids = context.Input("Ids"); + auto* table = context.Input("W"); + auto* d_output = context.Input(framework::GradVarName("Out")); + auto* d_table = context.Output(framework::GradVarName("W")); + + auto* ids_data = ids->data(); + auto ids_dim = ids->dims(); + + auto stream = dev_ctx.stream(); + // copy GPU memory to CPU pinned memory + framework::Vector new_rows; + new_rows.resize(ids_dim[0]); + auto gpu_place = boost::get(context.GetPlace()); + + // TODO(yuyang18): Strange code here. 
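      // Editor's note: the Copy below stages the looked-up ids as the row
      // index of the SelectedRows gradient (they are installed via
      // d_table->set_rows(new_rows) right after). The destination pointer is
      // obtained from new_rows.CUDAMutableData(), i.e. device memory, while
      // the destination place passed to memory::Copy is CPUPlace(); that
      // mismatch appears to be what the TODO above calls strange.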
+ memory::Copy(platform::CPUPlace(), + new_rows.CUDAMutableData(context.GetPlace()), gpu_place, + ids_data, ids_dim[0] * sizeof(int64_t), stream); + + d_table->set_rows(new_rows); + + auto* d_table_value = d_table->mutable_value(); + d_table_value->Resize({ids_dim[0], table->dims()[1]}); + d_table_value->mutable_data(context.GetPlace()); + + auto* d_table_data = d_table_value->data(); + auto* d_output_data = d_output->data(); + PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output->dims()); + memory::Copy(gpu_place, d_table_data, gpu_place, d_output_data, + d_output->numel() * sizeof(T), stream); + + } else { + auto ids_t = context.Input("Ids"); + auto d_output_t = context.Input(framework::GradVarName("Out")); + auto d_table_t = context.Output(framework::GradVarName("W")); + + int N = d_table_t->dims()[0]; + int D = d_table_t->dims()[1]; + int K = ids_t->numel(); + const int64_t* ids = ids_t->data(); + const T* d_output = d_output_t->data(); + T* d_table = d_table_t->mutable_data(context.GetPlace()); + + auto t = framework::EigenVector::Flatten(*d_table_t); + t.device(*dev_ctx.eigen_device()) = t.constant(static_cast(0)); + + dim3 threads(128, 8); + dim3 grids(8, 1); + LookupTableGrad<<>>( + d_table, d_output, ids, N, K, D); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(lookup_table, ops::LookupTableCUDAKernel, + ops::LookupTableCUDAKernel); +REGISTER_OP_CUDA_KERNEL(lookup_table_grad, + ops::LookupTableGradCUDAKernel, + ops::LookupTableGradCUDAKernel); diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h new file mode 100644 index 0000000000000000000000000000000000000000..d264496882a9e1828953d843d4a18fe4c16b1d24 --- /dev/null +++ b/paddle/fluid/operators/lookup_table_op.h @@ -0,0 +1,126 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/selected_rows.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using SelectedRows = framework::SelectedRows; + +template +class LookupTableKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* table_t = context.Input("W"); // float tensor + auto* ids_t = context.Input("Ids"); // int tensor + auto* output_t = context.Output("Out"); // float tensor + int64_t padding_idx = context.Attr("padding_idx"); + + int N = table_t->dims()[0]; + int D = table_t->dims()[1]; + auto* ids = ids_t->data(); + auto* table = table_t->data(); + auto* output = output_t->mutable_data(context.GetPlace()); + + if (padding_idx == -1) { + for (int64_t i = 0; i < ids_t->numel(); ++i) { + PADDLE_ENFORCE_LT(ids[i], N); + PADDLE_ENFORCE_GE(ids[i], 0); + memcpy(output + i * D, table + ids[i] * D, D * sizeof(T)); + } + } else { + for (int64_t i = 0; i < ids_t->numel(); ++i) { + if (ids[i] == padding_idx) { + memset(output + i * D, 0, D * sizeof(T)); + } else { + PADDLE_ENFORCE_LT(ids[i], N); + PADDLE_ENFORCE_GE(ids[i], 0); + memcpy(output + i * D, table + ids[i] * D, D * sizeof(T)); + } + } + } + } +}; + +template +class LookupTableGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool is_sparse = context.Attr("is_sparse"); + // Since paddings are not trainable and fixed in forward, the gradient of + // paddings makes no sense and we don't deal with it in backward. + if (is_sparse) { + auto* ids = context.Input("Ids"); + auto* table = context.Input("W"); + auto* d_output = context.Input(framework::GradVarName("Out")); + auto* d_table = context.Output(framework::GradVarName("W")); + + auto* ids_data = ids->data(); + auto ids_dim = ids->dims(); + + framework::Vector new_rows; + new_rows.reserve(ids_dim[0]); + for (int64_t i = 0; i < ids_dim[0]; i++) { + new_rows.push_back(ids_data[i]); + } + d_table->set_rows(new_rows); + + auto* d_table_value = d_table->mutable_value(); + d_table_value->Resize({ids_dim[0], table->dims()[1]}); + d_table_value->mutable_data(context.GetPlace()); + + d_table->set_height(table->dims()[0]); + + auto* d_output_data = d_output->data(); + auto* d_table_data = d_table_value->data(); + + PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output->dims()); + memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel()); + } else { + auto* ids = context.Input("Ids"); + auto* d_output = context.Input(framework::GradVarName("Out")); + auto* d_table = context.Output(framework::GradVarName("W")); + auto* table = context.Input("W"); + + auto* ids_data = ids->data(); + auto ids_dim = ids->dims(); + + int N = table->dims()[0]; + int D = d_output->dims()[1]; + + auto* d_output_data = d_output->data(); + auto* d_table_data = d_table->mutable_data(context.GetPlace()); + + memset(d_table_data, 0, d_table->numel() * sizeof(T)); + + for (int64_t i = 0; i < ids->numel(); ++i) { + PADDLE_ENFORCE_LT(ids_data[i], N); + PADDLE_ENFORCE_GE(ids_data[i], 0); + for (int j = 0; j < D; ++j) { + d_table_data[ids_data[i] * D + j] += d_output_data[i * D + j]; + } + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc new file mode 100644 index 
0000000000000000000000000000000000000000..c84507f231c6d3c4c5d6e33719fafc2752876f03 --- /dev/null +++ b/paddle/fluid/operators/lrn_op.cc @@ -0,0 +1,236 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/lrn_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +template +struct LRNFunctor { + void operator()(const framework::ExecutionContext& ctx, + const framework::Tensor& input, framework::Tensor* out, + framework::Tensor* mid, int N, int C, int H, int W, int n, + T k, T alpha, T beta) { + auto x_v = framework::EigenVector::Flatten(input); + + const int start = -(n - 1) / 2; + const int end = start + n; + + auto e_mid = framework::EigenTensor::From(*mid); + e_mid = e_mid.constant(k); + + auto e_x = framework::EigenTensor::From(input); + for (int m = 0; m < N; m++) { + for (int i = 0; i < C; i++) { + for (int c = start; c <= end; c++) { + int ch = i + c; + if (ch >= 0 && ch < C) { + auto s = e_mid.slice(Eigen::array({{m, i, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + auto r = e_x.slice(Eigen::array({{m, ch, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + s += alpha * r.square(); + } + } + } + } + + auto out_e = framework::EigenVector::Flatten(*out); + out_e = x_v * e_mid.reshape(Eigen::DSizes(e_mid.size())).pow(-beta); + } +}; +template struct LRNFunctor; +template struct LRNFunctor; + +template +struct LRNGradFunctor { + void operator()(const framework::ExecutionContext& ctx, + const framework::Tensor& x, const framework::Tensor& out, + const framework::Tensor& mid, framework::Tensor* x_g, + const framework::Tensor& out_g, int N, int C, int H, int W, + int n, T alpha, T beta) { + T ratio = -2 * alpha * beta; + auto x_g_e = framework::EigenVector::Flatten(*x_g); + x_g_e = x_g_e.constant(0.0); + + auto e_x = framework::EigenTensor::From(x); + auto e_x_g = framework::EigenTensor::From(*x_g); + auto e_out = framework::EigenTensor::From(out); + auto e_out_g = framework::EigenTensor::From(out_g); + auto e_mid = framework::EigenTensor::From(mid); + + const int start = -(n - 1) / 2; + const int end = start + n; + for (int m = 0; m < N; m++) { + for (int i = 0; i < C; i++) { + auto i_x = e_x.slice(Eigen::array({{m, i, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + auto i_x_g = e_x_g.slice(Eigen::array({{m, i, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + auto i_out_g = e_out_g.slice(Eigen::array({{m, i, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + auto i_mid = e_mid.slice(Eigen::array({{m, i, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + i_x_g = i_mid.pow(-beta) * i_out_g; + for (int c = start; c <= end; c++) { + int ch = i + c; + if (ch < 0 || ch >= C) { + continue; + } + + auto c_out = e_out.slice(Eigen::array({{m, ch, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + auto c_mid = e_mid.slice(Eigen::array({{m, ch, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + auto c_out_g = e_out_g.slice(Eigen::array({{m, ch, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + i_x_g += ratio * c_out_g * c_out * i_x / 
c_mid; + } + } + } + } +}; +template struct LRNGradFunctor; +template struct LRNGradFunctor; + +class LRNOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of LRNOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of LRNOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("MidOut"), + "MidOut(Out) of LRNOp should not be null."); + + auto x_dim = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(x_dim.size(), 4, "Input(X)'rank of LRNOp should be 4."); + + ctx->SetOutputDim("Out", x_dim); + ctx->SetOutputDim("MidOut", x_dim); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +template +class LRNOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LRNOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(Tensor) The input of LRN operator. " + "It must be a 4D tenor with NCHW format."); + AddOutput("Out", + "(Tensor) The output of LRN operator, which is also the 4D " + "tensor with NCHW format."); + AddOutput("MidOut", + "(Tensor) Middle result of LRN operator. It's computed in " + "forward process and also used in backward process."); + + AddAttr("n", + "(int default 5) " + "n is the \"adjacent\" kernel that maps " + "at the same spatial position.") + .SetDefault(5) + .GreaterThan(0); + + AddAttr("k", + "(float, default 2.0) " + "k is the bias.") + .SetDefault(2.0) + .GreaterThan(0.0); + + AddAttr("alpha", + "(float, default 0.0001) " + "alpha is the scale number.") + .SetDefault(0.0001) + .GreaterThan(0.0); + + AddAttr("beta", + "(float, default 0.75) " + "beta is the power number.") + .SetDefault(0.75) + .GreaterThan(0.0); + + AddComment(R"DOC( +Local Response Normalization Operator. + +This operator comes from the paper: +<>. + +The original formula is: + +$$ +Output(i, x, y) = Input(i, x, y) / \left( +k + \alpha \sum\limits^{\min(C, c + n/2)}_{j = \max(0, c - n/2)} +(Input(j, x, y))^2 +\right)^{\beta} +$$ + +Function implementation: + +Inputs and outpus are in NCHW format, while input.shape.ndims() equals 4. +And dimensions 0 ~ 3 represent batch size, feature maps, rows, +and columns, respectively. + +Input and Output in the formula above is for each map(i) of one image, and +Input(i, x, y), Output(i, x, y) represents an element in an image. + +C is the number of feature maps of one image. n is a hyper-parameter +configured when operator is initialized. The sum in the denominator +is the sum of the same positions in the neighboring maps. 
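
A scalar sketch of the same computation for a single spatial position may make this concrete (CrossMapLRN below is an illustrative helper written against the formula above, not the operator code; channels near the borders simply see a smaller window):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// For one pixel, scale every channel by the windowed sum of squares of its
// neighbouring channels (window of size n, clipped at the borders).
std::vector<double> CrossMapLRN(const std::vector<double>& in, int n, double k,
                                double alpha, double beta) {
  const int C = static_cast<int>(in.size());
  std::vector<double> out(C);
  for (int c = 0; c < C; ++c) {
    double denom = k;
    const int lo = std::max(0, c - (n - 1) / 2);
    const int hi = std::min(C - 1, c + n / 2);
    for (int j = lo; j <= hi; ++j) denom += alpha * in[j] * in[j];
    out[c] = in[c] / std::pow(denom, beta);
  }
  return out;
}

int main() {
  for (double v : CrossMapLRN({1.0, 2.0, 3.0, 4.0}, 5, 2.0, 1e-4, 0.75)) {
    std::printf("%.4f ", v);
  }
  std::printf("\n");
}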
+ +)DOC"); + } +}; + +class LRNOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasInput("MidOut"), "Input(MidOut) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + + auto x_dims = ctx->GetInputDim("X"); + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(lrn, ops::LRNOp, ops::LRNOpMaker, lrn_grad, ops::LRNOpGrad); +REGISTER_OP_CPU_KERNEL( + lrn, ops::LRNKernel); +REGISTER_OP_CPU_KERNEL( + lrn_grad, ops::LRNGradKernel); diff --git a/paddle/fluid/operators/lrn_op.cu b/paddle/fluid/operators/lrn_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..03112bf3e03595a521c22cd914f414f026970c10 --- /dev/null +++ b/paddle/fluid/operators/lrn_op.cu @@ -0,0 +1,178 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/lrn_op.h" + +namespace paddle { +namespace operators { + +template +__global__ void KeCMRNormFillScale(int img_size, const T* in, T* mid, int C, + int H, int W, int size, T k, T alpha) { + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < img_size) { + const int w = idx % W; + const int h = (idx / W) % H; + const int n = idx / W / H; + const int offset = (n * C * H + h) * W + w; + + in += offset; + mid += offset; + const int step = H * W; + const int pre_pad = (size - 1) / 2; + const int post_pad = size - pre_pad - 1; + + T accum = 0; + int index = 0; + while (index < C + post_pad) { + if (index < C) { + T val = in[index * step]; + accum += val * val; + } + if (index >= size) { + T val = in[(index - size) * step]; + accum -= val * val; + } + if (index >= post_pad) { + mid[(index - post_pad) * step] = k + accum * alpha; + } + ++index; + } + } +} + +template +__global__ void KeCMRNormOutput(int input_size, const T* in, const T* mid, + T negative_beta, T* out) { + const int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index < input_size) { + out[index] = in[index] * pow(mid[index], negative_beta); + } +} + +template +void CrossMapNormal(const framework::ExecutionContext& ctx, const T* inputs, + T* outputs, T* mid, int N, int C, int H, int W, int n, T k, + T alpha, T beta) { + int img_size = N * H * W; + const int block_size = 1024; + int grid_size = (img_size + block_size - 1) / block_size; + + auto& dev_ctx = ctx.template device_context(); + KeCMRNormFillScale<<>>( + img_size, inputs, mid, C, H, W, n, k, alpha); + + int input_size = N * H * W * C; + grid_size = (input_size + block_size - 1) / block_size; + KeCMRNormOutput<<>>( + input_size, inputs, mid, -beta, outputs); +} + +template +struct LRNFunctor { + void operator()(const 
framework::ExecutionContext& ctx, + const framework::Tensor& input, framework::Tensor* out, + framework::Tensor* mid, int N, int C, int H, int W, int n, + T k, T alpha, T beta) { + CrossMapNormal( + ctx, input.data(), out->mutable_data(ctx.GetPlace()), + mid->mutable_data(ctx.GetPlace()), N, C, H, W, n, k, alpha, beta); + } +}; + +template struct LRNFunctor; +template struct LRNFunctor; + +template +__global__ void KeCMRNormDiff(int img_size, const T* x, const T* out, + const T* mid, T* x_g, const T* out_g, int C, + int H, int W, int size, T negative_beta, + T ratio) { + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < img_size) { + const int w = idx % W; + const int h = (idx / W) % H; + const int n = idx / W / H; + const int offset = (n * C * H + h) * W + w; + x += offset; + out += offset; + mid += offset; + out_g += offset; + x_g += offset; + + const int step = H * W; + const int pre_pad = size - (size + 1) / 2; + const int post_pad = size - pre_pad - 1; + + int index = 0; + T accum = 0; + // TODO(gongwb): optimize this with thread shared array. + while (index < C + post_pad) { + if (index < C) { + x_g[index * step] = 0.0; + accum += out_g[index * step] * out[index * step] / mid[index * step]; + } + if (index >= size) { + accum -= out_g[(index - size) * step] * out[(index - size) * step] / + mid[(index - size) * step]; + } + if (index >= post_pad) { + x_g[(index - post_pad) * step] += + out_g[(index - post_pad) * step] * + pow(mid[(index - post_pad) * step], negative_beta) - + ratio * x[(index - post_pad) * step] * accum; + } + ++index; + } + } +} + +template +void CrossMapNormalGrad(const framework::ExecutionContext& ctx, const T* x, + const T* out, const T* mid, T* x_g, const T* out_g, + int N, int C, int H, int W, int n, T alpha, T beta) { + int img_size = N * H * W; + + const int block_size = 1024; + int grid_size = (img_size + block_size - 1) / block_size; + + auto& dev_ctx = ctx.template device_context(); + KeCMRNormDiff<<>>( + img_size, x, out, mid, x_g, out_g, C, H, W, n, -beta, + 2.0f * alpha * beta); +} + +template +struct LRNGradFunctor { + void operator()(const framework::ExecutionContext& ctx, + const framework::Tensor& x, const framework::Tensor& out, + const framework::Tensor& mid, framework::Tensor* x_g, + const framework::Tensor& out_g, int N, int C, int H, int W, + int n, T alpha, T beta) { + CrossMapNormalGrad(ctx, x.data(), out.data(), mid.data(), + x_g->mutable_data(ctx.GetPlace()), out_g.data(), + N, C, H, W, n, alpha, beta); + } +}; + +template struct LRNGradFunctor; +template struct LRNGradFunctor; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + lrn, ops::LRNKernel); +REGISTER_OP_CUDA_KERNEL( + lrn_grad, ops::LRNGradKernel); diff --git a/paddle/fluid/operators/lrn_op.h b/paddle/fluid/operators/lrn_op.h new file mode 100644 index 0000000000000000000000000000000000000000..b7b78b459145bae5483e3f12b3d872c679823740 --- /dev/null +++ b/paddle/fluid/operators/lrn_op.h @@ -0,0 +1,130 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +struct LRNFunctor { + void operator()(const framework::ExecutionContext& ctx, + const framework::Tensor& input, framework::Tensor* out, + framework::Tensor* mid, int N, int C, int H, int W, int n, + T k, T alpha, T beta); +}; + +template +class LRNKernel : public framework::OpKernel { + public: + using Tensor = framework::Tensor; + + // f(x) = x * ( k + alpha * SUM((x)^2) )^(-beta) + // x represents inputs + // f(x) represents outputs + void Compute(const framework::ExecutionContext& ctx) const override { + // input + const Tensor& x = *ctx.Input("X"); + auto x_dims = x.dims(); + + // NCHW + int N = x_dims[0]; + int C = x_dims[1]; + int H = x_dims[2]; + int W = x_dims[3]; + + Tensor* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + // MidOut save the intermediate result for backward + Tensor* mid = ctx.Output("MidOut"); + mid->mutable_data(ctx.GetPlace()); + + int n = ctx.Attr("n"); + T alpha = ctx.Attr("alpha"); + T beta = ctx.Attr("beta"); + T k = ctx.Attr("k"); + + PADDLE_ENFORCE(n > 0, "n should >= 0"); + PADDLE_ENFORCE(alpha >= 0.0, "alpha should >= 0.0"); + PADDLE_ENFORCE(beta >= 0.0, "beta should >= 0.0"); + PADDLE_ENFORCE(k >= 0.0, "k should >= 0.0"); + + LRNFunctor f; + f(ctx, x, out, mid, N, C, H, W, n, k, alpha, beta); + } +}; + +template +struct LRNGradFunctor { + void operator()(const framework::ExecutionContext& ctx, + const framework::Tensor& x, const framework::Tensor& out, + const framework::Tensor& mid, framework::Tensor* x_g, + const framework::Tensor& out_g, int N, int C, int H, int W, + int n, T alpha, T beta); +}; + +/** + * \brief Backward calculation for normalization with across maps. + * + * Function implementation: + * + * The implementation of this Function is derived from the + * CrossMapNormalFunc implementation. + * + * InputGrad = OutputGrad * MidOut ^ (-beta) + * -- upper + * + > (OutputGrad * OutputValue * (-2 * alpha * beta) / MidOut) * InputValue + * -- lower + * + * The data of inputs/outputs format is the same as the forward interface + * and is NCHW. + * + * The upper and lower is the same as forward. The logic of the sum + * is also the same as forward. 
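 *
 * An illustrative restatement with the summation made explicit (c indexes
 * the channel dimension, n is the window size, and window(c) denotes the
 * up-to-n channels around c used by the forward pass):
 *
 *   MidOut_c    = k + alpha * sum_{j in window(c)} X_j^2
 *   Out_c       = X_c * MidOut_c^(-beta)
 *
 *   InputGrad_c = OutputGrad_c * MidOut_c^(-beta)
 *                 - 2 * alpha * beta * X_c *
 *                   sum_{j : c in window(j)} OutputGrad_j * Out_j / MidOut_j
 *
 * i.e. the "upper" and "lower" terms above with the sum written out; the
 * factor 2 * alpha * beta is the ratio that CrossMapNormalGrad passes to
 * the kernel.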
+ */ +template +class LRNGradKernel : public framework::OpKernel { + public: + using Tensor = framework::Tensor; + void Compute(const framework::ExecutionContext& ctx) const override { + const Tensor& x = *ctx.Input("X"); + const Tensor& out = *ctx.Input("Out"); + const Tensor& out_g = *ctx.Input(framework::GradVarName("Out")); + const Tensor& mid = *ctx.Input("MidOut"); + + auto x_g = ctx.Output(framework::GradVarName("X")); + x_g->mutable_data(ctx.GetPlace()); + + auto x_dims = x.dims(); + int N = x_dims[0]; + int C = x_dims[1]; + int H = x_dims[2]; + int W = x_dims[3]; + + int n = ctx.Attr("n"); + T alpha = ctx.Attr("alpha"); + T beta = ctx.Attr("beta"); + + LRNGradFunctor f; + f(ctx, x, out, mid, x_g, out_g, N, C, H, W, n, alpha, beta); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/lstm_op.cc b/paddle/fluid/operators/lstm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d1f1b5f235f991e7a4d84c815bc5dda74ab64752 --- /dev/null +++ b/paddle/fluid/operators/lstm_op.cc @@ -0,0 +1,281 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/lstm_op.h" + +namespace paddle { +namespace operators { + +class LSTMOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Weight"), + "Input(Weight) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Bias"), + "Input(Bias) of LSTM should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("Hidden"), + "Output(Hidden) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Cell"), + "Output(Cell) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("BatchGate"), + "Output(BatchGate) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("BatchCellPreAct"), + "Output(BatchGate) of LSTM should not be null."); + + auto in_dims = ctx->GetInputDim("Input"); + PADDLE_ENFORCE_EQ(in_dims.size(), 2, "Input(X)'s rank must be 2."); + + if (ctx->HasInput("H0")) { + PADDLE_ENFORCE(ctx->HasInput("C0"), + "Input(Cell) and Input(Hidden) of LSTM should not " + "be null at the same time."); + auto h_dims = ctx->GetInputDim("H0"); + auto c_dims = ctx->GetInputDim("C0"); + PADDLE_ENFORCE(h_dims == c_dims, + "The dimension of Input(H0) and Input(C0) " + "should be the same."); + } + + int frame_size = in_dims[1] / 4; + auto w_dims = ctx->GetInputDim("Weight"); + PADDLE_ENFORCE_EQ(w_dims.size(), 2, + "The rank of Input(Weight) should be 2."); + PADDLE_ENFORCE_EQ(w_dims[0], frame_size, + "The first dimension of Input(Weight) " + "should be %d.", + frame_size); + PADDLE_ENFORCE_EQ(w_dims[1], 4 * frame_size, + "The second dimension of Input(Weight) " + "should be 4 * %d.", + frame_size); + + auto b_dims = ctx->GetInputDim("Bias"); + 
PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2."); + PADDLE_ENFORCE_EQ(b_dims[0], 1, + "The first dimension of Input(Bias) should be 1."); + + if (ctx->Attrs().Get("use_peepholes")) { + PADDLE_ENFORCE_EQ(b_dims[1], 7 * frame_size, + "The second dimension of Input(Bias) should be " + "7 * %d if enable peepholes connection", + frame_size); + } else { + PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size, + "The second dimension of Input(Bias) should be " + "4 * %d if disable peepholes connection", + frame_size); + } + + framework::DDim out_dims({in_dims[0], frame_size}); + ctx->SetOutputDim("Hidden", out_dims); + ctx->SetOutputDim("Cell", out_dims); + ctx->SetOutputDim("BatchGate", in_dims); + ctx->SetOutputDim("BatchCellPreAct", out_dims); + ctx->ShareLoD("Input", "Hidden"); + ctx->ShareLoD("Input", "Cell"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), + ctx.device_context()); + } +}; + +class LSTMOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LSTMOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Input", + "(LoDTensor) the first input is a LodTensor, which support " + "variable-time length input sequence. The underlying tensor in " + "this LoDTensor is a matrix with shape (T X 4D), where T is the " + "total time steps in this mini-batch, D is the hidden size."); + AddInput("H0", + "(Tensor, optional) the initial hidden state is an optional " + "input. This is a tensor with shape (N x D), where N is the " + "batch size and D is the hidden size.") + .AsDispensable(); + AddInput("C0", + "(Tensor, optional) the initial cell state is an optional " + "input. This is a tensor with shape (N x D), where N is the " + "batch size. `H0` and `C0` can be NULL but only at the same time.") + .AsDispensable(); + AddInput("Weight", + "(Tensor) the learnable hidden-hidden weights." + " - The shape is (D x 4D), where D is the hidden size. " + " - Weight = {W_ch, W_ih, W_fh, W_oh}"); + AddInput("Bias", + "(Tensor) the learnable weights, which contains two parts: " + "input-hidden bias weight and peephole connections weight if " + "setting `use_peepholes` True. " + "1. `use_peepholes = False` " + " - The shape is (1 x 4D). " + " - Bias = {b_c, b_i, b_f, b_o}." + "2. `use_peepholes = True` " + " - The shape is (1 x 7D). " + " - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}."); + AddOutput("Hidden", + "(LoDTensor) the hidden state of LSTM operator. " + "The shape is (T x D), and lod is the same with the `Input`."); + AddOutput("Cell", + "(LoDTensor) the cell state of LSTM operator. " + "The shape is (T x D), and lod is the same with the `Input`."); + AddOutput("BatchGate", + "(LoDTensor) This LoDTensor contains input gate, forget gate " + "and output gate after the nonlinear computation. This " + "LoDTensor has the same shape as the reorganized input, which " + "is also be called batch input. The LoD size is 2. 
The first " + "LoD is the batch offsets and the second LoD contains the " + "indexes, which denote the position of reorganized sequence " + "in the raw input.") + .AsIntermediate(); + AddOutput("BatchCellPreAct", + "(LoDTensor) This LoDTensor is obtained in the forward and used " + "in the backward.") + .AsIntermediate(); + AddAttr("use_peepholes", + "(bool, defalut: True) " + "whether to enable diagonal/peephole connections.") + .SetDefault(true); + AddAttr("is_reverse", + "(bool, defalut: False) " + "whether to compute reversed LSTM.") + .SetDefault(false); + AddAttr( + "gate_activation", + "(string, default: sigmoid)" + "The activation for input gate, forget gate and output " + "gate, `sigmoid` by default.") + .SetDefault("sigmoid") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddAttr("cell_activation", + "(string, default: tanh)" + "The activation for cell output, `tanh` by defalut.") + .SetDefault("tanh") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddAttr("candidate_activation", + "(string, default: tanh)" + "The activation for candidate hidden state, " + "`tanh` by default.") + .SetDefault("tanh") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddComment(R"DOC( +Long-Short Term Memory (LSTM) Operator. + +The defalut implementation is diagonal/peephole connection +(https://arxiv.org/pdf/1402.1128.pdf), the formula is as follows: + +$$ +i_t = \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + W_{ic}c_{t-1} + b_i) \\ + +f_t = \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + W_{fc}c_{t-1} + b_f) \\ + +\tilde{c_t} = act_g(W_{cx}x_t + W_{ch}h_{t-1} + b_c) \\ + +o_t = \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + W_{oc}c_t + b_o) \\ + +c_t = f_t \odot c_{t-1} + i_t \odot \tilde{c_t} \\ + +h_t = o_t \odot act_h(c_t) +$$ + +where the W terms denote weight matrices (e.g. $W_{xi}$ is the matrix +of weights from the input gate to the input), $W_{ic}, W_{fc}, W_{oc}$ +are diagonal weight matrices for peephole connections. In our implementation, +we use vectors to reprenset these diagonal weight matrices. The b terms +denote bias vectors ($b_i$ is the input gate bias vector), $\sigma$ +is the non-line activations, such as logistic sigmoid function, and +$i, f, o$ and $c$ are the input gate, forget gate, output gate, +and cell activation vectors, respectively, all of which have the same size as +the cell output activation vector $h$. + +The $\odot$ is the element-wise product of the vectors. $act_g$ and $act_h$ +are the cell input and cell output activation functions and `tanh` is usually +used for them. $\tilde{c_t}$ is also called candidate hidden state, +which is computed based on the current input and the previous hidden state. + +Set `use_peepholes` False to disable peephole connection. The formula +is omitted here, please refer to the paper +http://www.bioinf.jku.at/publications/older/2604.pdf for details. + +Note that these $W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}$ +operations on the input $x_{t}$ are NOT included in this operator. +Users can choose to use fully-connect operator before LSTM operator. 
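As a concrete shape example (illustrative): with total time steps T = 6 and
hidden size D = 32, Input should already hold the concatenated input-to-gate
projections, i.e. a (6 x 128) matrix; Weight is then (32 x 128), Bias is
(1 x 128) without peepholes or (1 x 224) with peepholes, and both Hidden and
Cell come out as (6 x 32).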
+ +)DOC"); + } +}; + +class LSTMGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Hidden"), + "Input(Hidden) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Cell"), + "Input(Cell) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Weight"), + "Input(Weight) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Bias"), + "Input(Bias) of LSTM should not be null."); + + PADDLE_ENFORCE(ctx->HasInput("BatchGate"), + "Input(BatchGate) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("BatchCellPreAct"), + "Input(BatchGate) of LSTM should not be null."); + + auto SetOutGradDim = [&ctx](const std::string& name) { + auto g_name = framework::GradVarName(name); + if (ctx->HasOutput(g_name)) + ctx->SetOutputDim(g_name, ctx->GetInputDim(name)); + }; + + SetOutGradDim("Input"); + SetOutGradDim("Weight"); + SetOutGradDim("Bias"); + SetOutGradDim("H0"); + SetOutGradDim("C0"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), + ctx.device_context()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(lstm, ops::LSTMOp, ops::LSTMOpMaker, lstm_grad, ops::LSTMGradOp); +REGISTER_OP_CPU_KERNEL( + lstm, ops::LSTMKernel, + ops::LSTMKernel); +REGISTER_OP_CPU_KERNEL( + lstm_grad, ops::LSTMGradKernel, + ops::LSTMGradKernel); diff --git a/paddle/fluid/operators/lstm_op.cu.cc b/paddle/fluid/operators/lstm_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..679d02b1f9a1d1f0313b5c8109285c92734e3e5a --- /dev/null +++ b/paddle/fluid/operators/lstm_op.cu.cc @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/lstm_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + lstm, ops::LSTMKernel, + ops::LSTMKernel); +REGISTER_OP_CUDA_KERNEL( + lstm_grad, ops::LSTMGradKernel, + ops::LSTMGradKernel); diff --git a/paddle/fluid/operators/lstm_op.h b/paddle/fluid/operators/lstm_op.h new file mode 100644 index 0000000000000000000000000000000000000000..1c48495533cf5dbf1f46176cff936f7f988a3d48 --- /dev/null +++ b/paddle/fluid/operators/lstm_op.h @@ -0,0 +1,376 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/detail/activation_functions.h" +#include "paddle/fluid/operators/math/lstm_compute.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/sequence2batch.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using Tensor = framework::Tensor; + +template +inline void ReorderInitState(const DeviceContext& ctx, + const framework::Tensor& src, + framework::Vector index_lod, + framework::Tensor* dst, bool indexed_src) { + math::CopyMatrixRowsFunctor row_shuffle; + dst->mutable_data(src.dims(), ctx.GetPlace()); + row_shuffle(ctx, src, index_lod, *dst, indexed_src); +} + +template +class LSTMKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* weight = ctx.Input("Weight"); + auto* bias = ctx.Input("Bias"); + + auto* hidden_t0 = ctx.Input("H0"); + auto* cell_t0 = ctx.Input("C0"); + + auto* batch_gate = ctx.Output("BatchGate"); + batch_gate->mutable_data(ctx.GetPlace()); + auto* hidden_out = ctx.Output("Hidden"); + hidden_out->mutable_data(ctx.GetPlace()); + auto* cell_out = ctx.Output("Cell"); + cell_out->mutable_data(ctx.GetPlace()); + + bool is_reverse = ctx.Attr("is_reverse"); + math::LoDTensor2BatchFunctor to_batch; + auto& device_ctx = ctx.template device_context(); + to_batch(device_ctx, *input, *batch_gate, true, is_reverse); + + auto in_dims = input->dims(); + int frame_size = static_cast(in_dims[1] / 4); + framework::DDim dims({in_dims[0], frame_size}); + + if (bias) { + Tensor b = *bias; + b.Resize({bias->numel(), 1}); + Tensor gate_bias = b.Slice(0, 4 * frame_size); + math::RowwiseAdd add_bias; + add_bias(device_ctx, *batch_gate, gate_bias, batch_gate); + } + + math::LstmMetaValue lstm_value; + if (bias && ctx.Attr("use_peepholes")) { + T* bias_data = const_cast(bias->data()); + // the code style in LstmMetaValue will be updated later. + + lstm_value.check_ig = bias_data + 4 * frame_size; + lstm_value.check_fg = lstm_value.check_ig + frame_size; + lstm_value.check_og = lstm_value.check_fg + frame_size; + } else { + lstm_value.check_ig = nullptr; + lstm_value.check_fg = nullptr; + lstm_value.check_og = nullptr; + } + lstm_value.prev_state_value = nullptr; + Tensor ordered_c0; + + framework::Vector order(batch_gate->lod()[2]); + + if (cell_t0) { + // Since the batch computing for LSTM reorders the input sequence + // according to their length. The initialized cell state also needs + // to reorder. + ReorderInitState(device_ctx, *cell_t0, order, + &ordered_c0, true); + lstm_value.prev_state_value = ordered_c0.data(); + } + + // Use the local variable as here. 
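    // (Illustrative note on the batch layout relied on below: after
    //  LoDTensor2BatchFunctor, batch_gate->lod()[0] holds the per-time-step
    //  offsets used as batch_starts, while batch_gate->lod()[2] holds the
    //  sequence reordering applied to H0/C0. For example, two sequences of
    //  lengths 3 and 2 are reorganized into 3 time steps with batch sizes
    //  {2, 2, 1}, so batch_starts = {0, 2, 4, 5} and num_batch = 3.)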
+ LoDTensor batch_hidden, batch_cell; + auto* batch_cell_pre_act = ctx.Output("BatchCellPreAct"); + batch_hidden.mutable_data(dims, ctx.GetPlace()); + batch_cell.mutable_data(dims, ctx.GetPlace()); + batch_cell_pre_act->mutable_data(dims, ctx.GetPlace()); + + auto batch_starts = batch_gate->lod()[0]; + size_t num_batch = batch_starts.size() - 1; + auto gate_act = math::detail::GetActivationType( + ctx.Attr("gate_activation")); + auto cell_act = math::detail::GetActivationType( + ctx.Attr("cell_activation")); + auto cand_act = math::detail::GetActivationType( + ctx.Attr("candidate_activation")); + + for (size_t n = 0; n < num_batch; n++) { + int bstart = static_cast(batch_starts[n]); + int bend = static_cast(batch_starts[n + 1]); + + Tensor gate_t = batch_gate->Slice(bstart, bend); + Tensor out_t = batch_hidden.Slice(bstart, bend); + Tensor cell_t = batch_cell.Slice(bstart, bend); + Tensor cell_pre_act_t = batch_cell_pre_act->Slice(bstart, bend); + + int cur_batch_size = bend - bstart; + + if (n > 0) { + int pre_h_start = static_cast(batch_starts[n - 1]); + int pre_h_end = pre_h_start + cur_batch_size; + auto pre_hidden_t = batch_hidden.Slice(pre_h_start, pre_h_end); + math::matmul(device_ctx, pre_hidden_t, false, *weight, + false, static_cast(1.0), &gate_t, + static_cast(1.0)); + } else if (hidden_t0) { + // If n == 0 and there is no initialized hidden state, that is to say + // the H0 is zeros, the calculation W_h * H0 will be skiped. + // If n == 0 and there is initialized hidden state, calculate W_h * H0. + + // Since the batch computing for LSTM reorders the input sequence + // according to their length. The initialized hidden state also needs + // to reorder. + Tensor ordered_h0; + ReorderInitState(device_ctx, *hidden_t0, order, + &ordered_h0, true); + math::matmul(device_ctx, ordered_h0, false, *weight, + false, static_cast(1.0), &gate_t, + static_cast(1.0)); + } + + lstm_value.gate_value = gate_t.data(); + lstm_value.output_value = out_t.data(); + lstm_value.state_value = cell_t.data(); + lstm_value.state_active_value = cell_pre_act_t.data(); + math::LstmUnitFunctor::compute( + device_ctx, lstm_value, frame_size, cur_batch_size, gate_act, + cell_act, cand_act); + lstm_value.prev_state_value = lstm_value.state_value; + } + + math::Batch2LoDTensorFunctor to_seq; + batch_hidden.set_lod(batch_gate->lod()); + // restore the output hidden in LoDTensor from the batch hidden + to_seq(device_ctx, batch_hidden, *hidden_out); + + batch_cell.set_lod(batch_gate->lod()); + // restore the output cell state in LoDTensor from the batch cell + to_seq(device_ctx, batch_cell, *cell_out); + } +}; + +template +class LSTMGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* weight = ctx.Input("Weight"); + auto* bias = ctx.Input("Bias"); + + auto* hidden_out = ctx.Input("Hidden"); + auto* cell_out = ctx.Input("Cell"); + + auto* batch_gate = ctx.Input("BatchGate"); + auto* batch_cell_pre_act = ctx.Input("BatchCellPreAct"); + + auto* hidden_g = ctx.Input(framework::GradVarName("Hidden")); + + auto* in_g = ctx.Output(framework::GradVarName("Input")); + auto* weight_g = ctx.Output(framework::GradVarName("Weight")); + auto* bias_g = ctx.Output(framework::GradVarName("Bias")); + + auto* h0 = ctx.Input("H0"); + auto* c0 = ctx.Input("C0"); + + auto* h0_g = ctx.Output(framework::GradVarName("H0")); + auto* c0_g = ctx.Output(framework::GradVarName("C0")); + + auto& device_ctx = ctx.template 
device_context(); + math::SetConstant zero; + if (weight_g) { + weight_g->mutable_data(ctx.GetPlace()); + zero(device_ctx, weight_g, static_cast(0.0)); + } + + // ordered_h0/c0 is the reordered hidden/cell initialization. + // ordered_h0_g/c0_g is the reordered gradient of hidden/cell + // initialization. + Tensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g; + framework::Vector order(batch_gate->lod()[2]); + + if (c0) { + ReorderInitState(device_ctx, *c0, order, &ordered_c0, + true); + } + if (c0 && c0_g) { + ordered_c0_g.mutable_data(c0_g->dims(), ctx.GetPlace()); + } + + auto in_dims = input->dims(); + auto out_dims = hidden_g->dims(); + int frame_size = static_cast(in_dims[1] / 4); + PADDLE_ENFORCE_EQ(frame_size, out_dims[1]); + + math::LstmMetaValue lstm_value; + if (bias && ctx.Attr("use_peepholes")) { + T* bias_data = const_cast(bias->data()); + lstm_value.check_ig = bias_data + 4 * frame_size; + lstm_value.check_fg = lstm_value.check_ig + frame_size; + lstm_value.check_og = lstm_value.check_fg + frame_size; + } else { + lstm_value.check_ig = nullptr; + lstm_value.check_fg = nullptr; + lstm_value.check_og = nullptr; + } + + math::LstmMetaGrad lstm_grad; + + if (bias && bias_g) { + bias_g->mutable_data(ctx.GetPlace()); + zero(device_ctx, bias_g, static_cast(0.0)); + } + if (bias && bias_g && ctx.Attr("use_peepholes")) { + T* bias_g_data = bias_g->data(); + lstm_grad.check_ig_grad = bias_g_data + 4 * frame_size; + lstm_grad.check_fg_grad = lstm_grad.check_ig_grad + frame_size; + lstm_grad.check_og_grad = lstm_grad.check_fg_grad + frame_size; + } else { + lstm_grad.check_ig_grad = nullptr; + lstm_grad.check_fg_grad = nullptr; + lstm_grad.check_og_grad = nullptr; + } + + math::LoDTensor2BatchFunctor to_batch; + + auto ToBatch = [&batch_gate, &to_batch]( + const DeviceContext& ctx, const framework::LoDTensor& src, + const framework::DDim& dims, framework::LoDTensor& dst) { + dst.mutable_data(dims, ctx.GetPlace()); + dst.set_lod(batch_gate->lod()); + to_batch(ctx, src, dst, false); + }; + + LoDTensor batch_hidden, batch_hidden_g, batch_cell; + ToBatch(device_ctx, *hidden_out, out_dims, batch_hidden); + ToBatch(device_ctx, *hidden_g, out_dims, batch_hidden_g); + ToBatch(device_ctx, *cell_out, out_dims, batch_cell); + + LoDTensor batch_cell_g, batch_gate_g; + batch_cell_g.mutable_data(out_dims, ctx.GetPlace()); + // TODO(qingqing) support the case output cell has gradient. 
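    // (Until that case is supported, the gradient coming in through Cell is
    //  taken to be zero, so batch_cell_g is simply zero-filled below rather
    //  than converted from a cell-output gradient.)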
+ // to_batch(device_ctx, *cell_g, batch_cell_g, false); + zero(device_ctx, &batch_cell_g, static_cast(0.0)); + batch_gate_g.mutable_data(batch_gate->dims(), ctx.GetPlace()); + batch_gate_g.set_lod(batch_gate->lod()); + + auto gate_act = math::detail::GetActivationType( + ctx.Attr("gate_activation")); + auto cell_act = math::detail::GetActivationType( + ctx.Attr("cell_activation")); + auto cand_act = math::detail::GetActivationType( + ctx.Attr("candidate_activation")); + + auto batch_starts = batch_gate->lod()[0]; + size_t num_batch = batch_starts.size() - 1; + for (int n = static_cast(num_batch) - 1; n >= 0; n--) { + int bstart = static_cast(batch_starts[n]); + int bend = static_cast(batch_starts[n + 1]); + + Tensor gate = batch_gate->Slice(bstart, bend); + Tensor cell = batch_cell.Slice(bstart, bend); + Tensor cell_pre_act = batch_cell_pre_act->Slice(bstart, bend); + lstm_value.gate_value = gate.data(); + lstm_value.state_value = cell.data(); + lstm_value.state_active_value = cell_pre_act.data(); + + Tensor out_g = batch_hidden_g.Slice(bstart, bend); + Tensor gate_g = batch_gate_g.Slice(bstart, bend); + Tensor cell_g = batch_cell_g.Slice(bstart, bend); + lstm_grad.state_grad = cell_g.data(); + lstm_grad.gate_grad = gate_g.data(); + lstm_grad.output_grad = out_g.data(); + + if (n > 0) { + int bstart_pre = static_cast(batch_starts[n - 1]); + Tensor cell_pre = batch_cell.Slice(bstart_pre, bstart); + Tensor cell_pre_g = batch_cell_g.Slice(bstart_pre, bstart); + lstm_value.prev_state_value = cell_pre.data(); + lstm_grad.prev_state_grad = cell_pre_g.data(); + } else { + lstm_value.prev_state_value = c0 ? ordered_c0.data() : nullptr; + lstm_grad.prev_state_grad = c0_g ? ordered_c0_g.data() : nullptr; + } + + int cur_batch_size = bend - bstart; + math::LstmUnitGradFunctor::compute( + device_ctx, lstm_value, lstm_grad, frame_size, cur_batch_size, + gate_act, cell_act, cand_act); + + if (n > 0) { + int pre_h_start = static_cast(batch_starts[n - 1]); + int pre_h_end = pre_h_start + cur_batch_size; + auto pre_hidden_g = batch_hidden_g.Slice(pre_h_start, pre_h_end); + math::matmul(device_ctx, gate_g, false, *weight, true, + static_cast(1.0), &pre_hidden_g, + static_cast(1.0)); + if (weight_g) { + /* backward weight */ + auto pre_hidden = batch_hidden.Slice(pre_h_start, pre_h_end); + math::matmul(device_ctx, pre_hidden, true, gate_g, + false, static_cast(1.0), weight_g, + static_cast(1.0)); + } + } else { + if (h0 && weight_g) { + ReorderInitState(device_ctx, *h0, order, + &ordered_h0, true); + math::matmul(device_ctx, ordered_h0, true, gate_g, + false, static_cast(1.0), weight_g, + static_cast(1.0)); + } + if (h0 && h0_g) { + ordered_h0_g.mutable_data(h0_g->dims(), ctx.GetPlace()); + math::matmul(device_ctx, gate_g, false, *weight, + true, static_cast(1.0), + &ordered_h0_g, static_cast(0.0)); + } + } + } + + math::Batch2LoDTensorFunctor to_seq; + if (in_g) { + /* backward data */ + in_g->mutable_data(ctx.GetPlace()); + to_seq(device_ctx, batch_gate_g, *in_g); + } + if (bias && bias_g) { + /* backward bias */ + Tensor b_g = *bias_g; + b_g.Resize({bias_g->numel(), 1}); + Tensor gate_bias_g = b_g.Slice(0, 4 * frame_size); + math::ColwiseSum col_sum; + col_sum(device_ctx, batch_gate_g, &gate_bias_g); + } + + if (h0 && h0_g) { + ReorderInitState(device_ctx, ordered_h0_g, order, h0_g, + false); + } + if (c0 && c0_g) { + ReorderInitState(device_ctx, ordered_c0_g, order, c0_g, + false); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/lstm_unit_op.cc 
b/paddle/fluid/operators/lstm_unit_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3d33d47e0c3ba8b83be2c06e9884d90e9bb1012e --- /dev/null +++ b/paddle/fluid/operators/lstm_unit_op.cc @@ -0,0 +1,107 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/lstm_unit_op.h" + +namespace paddle { +namespace operators { + +class LstmUnitOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("C_prev"), + "Input(C_prev) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("C"), + "Output(C) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("H"), + "Output(H) of LSTM should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + auto c_prev_dims = ctx->GetInputDim("C_prev"); + + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2."); + PADDLE_ENFORCE_EQ(x_dims[0], c_prev_dims[0], + "Batch size of inputs and states must be equal"); + PADDLE_ENFORCE_EQ(x_dims[1], c_prev_dims[1] * 4, + "Dimension of FC should equal to prev state * 4"); + + int b_size = c_prev_dims[0]; // batch size + int s_dim = c_prev_dims[1]; // state dim + ctx->SetOutputDim("C", {b_size, s_dim}); + ctx->SetOutputDim("H", {b_size, s_dim}); + } +}; + +class LstmUnitOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LstmUnitOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "Lstm unit only applies non-linear activations, please make sure" + "that linear tranformation has already been applied to `X`. 
" + "Linear tranformation can be applied by adding a `fc` layer"); + AddInput( + "C_prev", + "The cell state tensor of last time-step in the Lstm Unit operator."); + AddOutput("C", "The cell tensor of Lstm Unit operator."); + AddOutput("H", "The hidden state tensor of Lstm Unit operator."); + AddAttr("forget_bias", + "(float, default 0.0) " + "The forget bias of Lstm Unit.") + .SetDefault(0.0); + AddComment(R"DOC( +Lstm Unit Operator + +Equation: + +$$ +i, f, o, j = split(X) \\ +C = C_{prev} * sigm(f + forget\_bias) + sigm(i) * tanh(j) \\ +H = C * sigm(o) +$$ + +)DOC"); + } +}; + +class LstmUnitGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("C")), + "Input(C@GRAD) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("H")), + "Input(H@GRAD) should not be null"); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->SetOutputDim(framework::GradVarName("C_prev"), + ctx->GetInputDim("C_prev")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(lstm_unit, ops::LstmUnitOp, ops::LstmUnitOpMaker, lstm_unit_grad, + ops::LstmUnitGradOp); +REGISTER_OP_CPU_KERNEL(lstm_unit, + ops::LstmUnitKernel, + ops::LstmUnitKernel); +REGISTER_OP_CPU_KERNEL( + lstm_unit_grad, ops::LstmUnitGradKernel, + ops::LstmUnitGradKernel); diff --git a/paddle/fluid/operators/lstm_unit_op.cu b/paddle/fluid/operators/lstm_unit_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..12ebffca37f995111bbeb2a1e8b30ea2fe35c74d --- /dev/null +++ b/paddle/fluid/operators/lstm_unit_op.cu @@ -0,0 +1,179 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* Acknowledgement: the following code is strongly inspired by +https://github.com/caffe2/caffe2/blob/master/caffe2/operators/lstm_unit_op_gpu.cu +*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/cross_entropy_op.h" +#include "paddle/fluid/platform/assert.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { + +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +template +__device__ Dtype cuda_sigmoid(const Dtype x) { + return Dtype(1) / (Dtype(1) + exp(-x)); +} + +template +__device__ Dtype cuda_tanh(const Dtype x) { + return Dtype(1 - exp(-2. * x)) / (Dtype(1) + exp(-2. 
* x)); +} + +template +__global__ void LSTMUnitKernel(const int nthreads, const int dim, + const T* C_prev, const T* X, T* C, T* H, + const T forget_bias) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + const int n = index / dim; + const int d = index % dim; + + const T* X_offset = X + 4 * dim * n; + const T i = cuda_sigmoid(X_offset[d]); + const T f = cuda_sigmoid(X_offset[1 * dim + d] + forget_bias); + const T o = cuda_sigmoid(X_offset[2 * dim + d]); + const T g = cuda_tanh(X_offset[3 * dim + d]); + const T c_prev = C_prev[index]; + const T c = f * c_prev + i * g; + C[index] = c; + const T tanh_c = cuda_tanh(c); + H[index] = o * tanh_c; + } +} + +template +__global__ void LSTMUnitGradientKernel(const int nthreads, const int dim, + const T* C_prev, const T* X, const T* C, + const T* H, const T* C_diff, + const T* H_diff, T* C_prev_diff, + T* X_diff, const T forget_bias) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + const int n = index / dim; + const int d = index % dim; + const T* X_offset = X + 4 * dim * n; + T* c_prev_diff = C_prev_diff + index; + T* X_diff_offset = X_diff + 4 * dim * n; + T* i_diff = X_diff_offset + d; + T* f_diff = X_diff_offset + 1 * dim + d; + T* o_diff = X_diff_offset + 2 * dim + d; + T* g_diff = X_diff_offset + 3 * dim + d; + + const T i = cuda_sigmoid(X_offset[d]); + const T f = cuda_sigmoid(X_offset[1 * dim + d] + forget_bias); + const T o = cuda_sigmoid(X_offset[2 * dim + d]); + const T g = cuda_tanh(X_offset[3 * dim + d]); + const T c_prev = C_prev[index]; + const T c = C[index]; + const T tanh_c = cuda_tanh(c); + const T c_term_diff = + C_diff[index] + H_diff[index] * o * (1 - tanh_c * tanh_c); + *c_prev_diff = c_term_diff * f; + *i_diff = c_term_diff * g * i * (1 - i); + *f_diff = c_term_diff * c_prev * f * (1 - f); + *o_diff = H_diff[index] * tanh_c * o * (1 - o); + *g_diff = c_term_diff * i * (1 - g * g); + } +} + +template +class LstmUnitOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + + auto* x_tensor = ctx.Input("X"); + auto* c_prev_tensor = ctx.Input("C_prev"); + auto* c_tensor = ctx.Output("C"); + auto* h_tensor = ctx.Output("H"); + + auto forget_bias = static_cast(ctx.Attr("forget_bias")); + + int b_size = c_tensor->dims()[0]; + int D = c_tensor->dims()[1]; + + const T* X = x_tensor->data(); + const T* C_prev = c_prev_tensor->data(); + + T* C = c_tensor->mutable_data(ctx.GetPlace()); + T* H = h_tensor->mutable_data(ctx.GetPlace()); + + int block = 512; + int n = b_size * D; + int grid = (n + block - 1) / block; + + LSTMUnitKernel<<>>(n, D, C_prev, X, C, H, forget_bias); + } +}; + +template +class LstmUnitGradOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + + auto x_tensor = ctx.Input("X"); + auto c_prev_tensor = ctx.Input("C_prev"); + auto c_tensor = ctx.Input("C"); + auto h_tensor = ctx.Input("H"); + + auto hdiff_tensor = ctx.Input(framework::GradVarName("H")); + auto cdiff_tensor = ctx.Input(framework::GradVarName("C")); + + auto xdiff_tensor = ctx.Output(framework::GradVarName("X")); + auto c_prev_diff_tensor = + ctx.Output(framework::GradVarName("C_prev")); + + auto* X = x_tensor->data(); + auto* C_prev = c_prev_tensor->data(); + auto* C = c_tensor->data(); + auto* H = h_tensor->data(); + + auto* H_diff = hdiff_tensor->data(); 
+ auto* C_diff = cdiff_tensor->data(); + + auto* C_prev_diff = c_prev_diff_tensor->mutable_data(ctx.GetPlace()); + auto* X_diff = xdiff_tensor->mutable_data(ctx.GetPlace()); + + int N = c_tensor->dims()[0]; + int D = c_tensor->dims()[1]; + + auto forget_bias = static_cast(ctx.Attr("forget_bias")); + + int block = 512; + int n = N * D; + int grid = (n + block - 1) / block; + + LSTMUnitGradientKernel<<>>(n, D, C_prev, X, C, H, C_diff, + H_diff, C_prev_diff, X_diff, + forget_bias); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(lstm_unit, ops::LstmUnitOpCUDAKernel, + ops::LstmUnitOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(lstm_unit_grad, ops::LstmUnitGradOpCUDAKernel, + ops::LstmUnitGradOpCUDAKernel); diff --git a/paddle/fluid/operators/lstm_unit_op.h b/paddle/fluid/operators/lstm_unit_op.h new file mode 100644 index 0000000000000000000000000000000000000000..9f2370fe690a45f49c0138fbd1303d7bfd6dacd0 --- /dev/null +++ b/paddle/fluid/operators/lstm_unit_op.h @@ -0,0 +1,151 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* Acknowledgement: the following code is strongly inspired by +https://github.com/caffe2/caffe2/blob/master/caffe2/operators/lstm_unit_op.h +*/ + +#pragma once +#include "glog/logging.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +template +inline T sigmoid(T x) { + return 1. / (1. + exp(-x)); +} + +template +inline T tanh(T x) { + return 2. * sigmoid(2. 
* x) - 1.; +} + +template +class LstmUnitKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), + "It must use CPUPlace."); + + auto* x_tensor = ctx.Input("X"); + auto* c_prev_tensor = ctx.Input("C_prev"); + auto* c_tensor = ctx.Output("C"); + auto* h_tensor = ctx.Output("H"); + + auto forget_bias = static_cast(ctx.Attr("forget_bias")); + + int b_size = c_tensor->dims()[0]; + int D = c_tensor->dims()[1]; + + T* C = c_tensor->mutable_data(ctx.GetPlace()); + T* H = h_tensor->mutable_data(ctx.GetPlace()); + + const T* X = x_tensor->data(); + const T* C_prev = c_prev_tensor->data(); + + for (int n = 0; n < b_size; ++n) { + for (int d = 0; d < D; ++d) { + const T i = sigmoid(X[d]); + const T f = sigmoid(X[1 * D + d] + forget_bias); + const T o = sigmoid(X[2 * D + d]); + const T g = tanh(X[3 * D + d]); + const T c_prev = C_prev[d]; + const T c = f * c_prev + i * g; + C[d] = c; + const T tanh_c = tanh(c); + H[d] = o * tanh_c; + } + C_prev += D; + X += 4 * D; + C += D; + H += D; + } + } +}; + +template +class LstmUnitGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), + "It must use CPUPlace."); + + auto x_tensor = ctx.Input("X"); + auto c_prev_tensor = ctx.Input("C_prev"); + auto c_tensor = ctx.Input("C"); + auto h_tensor = ctx.Input("H"); + + auto hdiff_tensor = ctx.Input(framework::GradVarName("H")); + auto cdiff_tensor = ctx.Input(framework::GradVarName("C")); + + auto xdiff_tensor = ctx.Output(framework::GradVarName("X")); + auto c_prev_diff_tensor = + ctx.Output(framework::GradVarName("C_prev")); + + auto* X = x_tensor->data(); + auto* C_prev = c_prev_tensor->data(); + auto* C = c_tensor->data(); + auto* H = h_tensor->data(); + + auto* H_diff = hdiff_tensor->data(); + auto* C_diff = cdiff_tensor->data(); + + auto* C_prev_diff = c_prev_diff_tensor->mutable_data(ctx.GetPlace()); + auto* X_diff = xdiff_tensor->mutable_data(ctx.GetPlace()); + + int N = c_tensor->dims()[0]; + int D = c_tensor->dims()[1]; + + auto forget_bias = static_cast(ctx.Attr("forget_bias")); + + for (int n = 0; n < N; ++n) { + for (int d = 0; d < D; ++d) { + T* c_prev_diff = C_prev_diff + d; + T* i_diff = X_diff + d; + T* f_diff = X_diff + 1 * D + d; + T* o_diff = X_diff + 2 * D + d; + T* g_diff = X_diff + 3 * D + d; + + const T i = sigmoid(X[d]); + const T f = sigmoid(X[1 * D + d] + forget_bias); + const T o = sigmoid(X[2 * D + d]); + const T g = tanh(X[3 * D + d]); + const T c_prev = C_prev[d]; + const T c = C[d]; + const T tanh_c = tanh(c); + const T c_term_diff = C_diff[d] + H_diff[d] * o * (1 - tanh_c * tanh_c); + *c_prev_diff = c_term_diff * f; + *i_diff = c_term_diff * g * i * (1 - i); + *f_diff = c_term_diff * c_prev * f * (1 - f); + *o_diff = H_diff[d] * tanh_c * o * (1 - o); + *g_diff = c_term_diff * i * (1 - g * g); + } + C_prev += D; + X += 4 * D; + C += D; + H += D; + C_diff += D; + H_diff += D; + X_diff += 4 * D; + C_prev_diff += D; + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/lstmp_op.cc b/paddle/fluid/operators/lstmp_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..2d30edf5c3cbabe7223a459a5f60b7b9aa51af9a --- /dev/null +++ b/paddle/fluid/operators/lstmp_op.cc @@ -0,0 +1,331 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/lstmp_op.h" + +namespace paddle { +namespace operators { + +class LSTMPOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Weight"), + "Input(Weight) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasInput("ProjWeight"), + "Input(ProjWeight) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Bias"), + "Input(Bias) of LSTMP operator should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("Projection"), + "Output(Projection) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Cell"), + "Output(Cell) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("BatchGate"), + "Output(BatchGate) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("BatchCellPreAct"), + "Output(BatchCellPreAct) of LSTMP operator should not be " + "null."); + PADDLE_ENFORCE(ctx->HasOutput("BatchHidden"), + "Output(BatchHidden) of LSTMP operator should not be null."); + + auto in_dims = ctx->GetInputDim("Input"); + PADDLE_ENFORCE_EQ(in_dims.size(), 2, + "Input(X)'s rank of LSTMP operator must be 2."); + + int frame_size = in_dims[1] / 4; + auto w_dims = ctx->GetInputDim("Weight"); + auto proj_dims = ctx->GetInputDim("ProjWeight"); + PADDLE_ENFORCE_EQ(w_dims.size(), 2, + "The rank of Input(Weight) should be 2."); + PADDLE_ENFORCE_EQ(w_dims[0], proj_dims[1], + "The first dimension of Input(Weight) " + "should be %d.", + proj_dims[1]); + PADDLE_ENFORCE_EQ(w_dims[1], 4 * frame_size, + "The second dimension of Input(Weight) " + "should be 4 * %d.", + frame_size); + + PADDLE_ENFORCE_EQ(proj_dims.size(), 2, + "The rank of Input(ProjWeight) should be 2."); + PADDLE_ENFORCE_EQ(proj_dims[0], frame_size, + "The first dimension of Input(ProjWeight) " + "should be %d.", + frame_size); + + if (ctx->HasInput("H0")) { + PADDLE_ENFORCE(ctx->HasInput("C0"), + "Input(C0) of LSTMP operator should not be null after " + "Input(H0) provided."); + auto h_dims = ctx->GetInputDim("H0"); + auto c_dims = ctx->GetInputDim("C0"); + PADDLE_ENFORCE(h_dims == c_dims, + "The dimension of Input(H0) and Input(C0) " + "should be the same."); + ctx->SetOutputDim("OrderedP0", {h_dims[0], proj_dims[1]}); + } + + auto b_dims = ctx->GetInputDim("Bias"); + PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2."); + PADDLE_ENFORCE_EQ(b_dims[0], 1, + "The first dimension of Input(Bias) should be 1."); + + if (ctx->Attrs().Get("use_peepholes")) { + PADDLE_ENFORCE_EQ(b_dims[1], 7 * frame_size, + "The second dimension of Input(Bias) should be " + "7 * %d if enable peepholes connection", + frame_size); + } else { + PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size, + "The second dimension of Input(Bias) should be " + "4 * %d if disable 
peepholes connection", + frame_size); + } + + framework::DDim out_dims({in_dims[0], frame_size}); + framework::DDim proj_out_dims({in_dims[0], proj_dims[1]}); + ctx->SetOutputDim("Projection", proj_out_dims); + ctx->SetOutputDim("Cell", out_dims); + ctx->SetOutputDim("BatchGate", in_dims); + ctx->SetOutputDim("BatchCellPreAct", out_dims); + ctx->SetOutputDim("BatchHidden", out_dims); + ctx->ShareLoD("Input", "Projection"); + ctx->ShareLoD("Input", "Cell"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), + ctx.device_context()); + } +}; + +class LSTMPOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LSTMPOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Input", + "(LoDTensor) the input for sequence data, which supports " + "variable-time length input sequence. The underlying tensor in " + "this LoDTensor is a matrix with shape (T X 4D), where T is the " + "total time steps in this mini-batch, D is the hidden size."); + AddInput("H0", + "(Tensor, optional) the initial hidden state is an optional " + "input. This is a tensor with shape (N x D), where N is the " + "batch size and D is the hidden size.") + .AsDispensable(); + AddInput("C0", + "(Tensor, optional) the initial cell state is an optional " + "input. This is a tensor with shape (N x D), where N is the " + "batch size. `C0` should not be null if `H0` provided.") + .AsDispensable(); + AddInput("Weight", + "(Tensor) the learnable hidden-hidden weights." + " - The shape is (P x 4D), where P is the projection layer size " + "and D is the hidden size." + " - Weight = {W_cr, W_ir, W_fr, W_or}"); + AddInput("ProjWeight", + "(Tensor) the learnable weight of the projection layer." + " - The shape is (D x P), where P is the recurrent projection " + "layer size and D is the hidden size." + " - ProjWeight = {W_rh}"); + AddInput("Bias", + "(Tensor) the learnable biases, which contains two parts: " + "input-hidden biases and peephole connections weights if " + "setting `use_peepholes` to `True`. " + "1. `use_peepholes = False` " + " - The shape is (1 x 4D). " + " - Bias = {b_c, b_i, b_f, b_o}." + "2. `use_peepholes = True` " + " - The shape is (1 x 7D). " + " - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}."); + AddOutput("Projection", + "(LoDTensor) the projection of the hidden state of LSTMP " + "operator. The shape is (T x P), and LoD is the same with the " + "`Input`."); + AddOutput("Cell", + "(LoDTensor) the cell state of LSTMP operator. " + "The shape is (T x D), and lod is the same with the `Input`."); + AddOutput("BatchGate", + "(LoDTensor) This LoDTensor contains input gate, forget gate " + "and output gate after the activations. This LoDTensor has the " + "same shape as the reorganized input, which is also be called " + "batch input. The LoD size is 2. The first-level LoD is the " + "batch offsets and the second contains the indices, which " + "denotes the position of reorganized sequence in the raw input.") + .AsIntermediate(); + AddOutput("BatchCellPreAct", + "(LoDTensor) the pre-activation cell state reorganized in batch. " + "This LoDTensor is obtained in the forward and used in the " + "backward.") + .AsIntermediate(); + AddOutput("BatchHidden", + "(LoDTensor) the hidden state reorganized in batch. 
" + "This LoDTensor is obtained in the forward and used in the " + "backward.") + .AsIntermediate(); + AddOutput("OrderedP0", + "(Tensor) the projection of the initial hidden state " + "H0. This is a tensor with shape (N x P), where N is the " + "batch size and P is the hidden size.") + .AsIntermediate(); + AddAttr("use_peepholes", + "(bool, defalut: True) " + "whether to enable diagonal/peephole connections.") + .SetDefault(true); + AddAttr("is_reverse", + "(bool, defalut: False) " + "whether to compute reversed LSTMP.") + .SetDefault(false); + AddAttr( + "gate_activation", + "(string, default: sigmoid)" + "The activation for input gate, forget gate and output " + "gate, `sigmoid` by default.") + .SetDefault("sigmoid") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddAttr("cell_activation", + "(string, default: tanh)" + "The activation for cell output, `tanh` by defalut.") + .SetDefault("tanh") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddAttr("candidate_activation", + "(string, default: tanh)" + "The activation for candidate hidden state, " + "`tanh` by default.") + .SetDefault("tanh") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddAttr("proj_activation", + "(string, default: tanh)" + "The activation for projection output, " + "`tanh` by defalut.") + .SetDefault("tanh") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddComment(R"DOC( +Long-Short Term Memory with recurrent Projection layer (LSTMP) Operator. + +LSTMP has a separate projection layer after the LSTM layer, projecting the +original hidden state to a lower-dimensional one, which is proposed to reduce +the number of total parameters and furthermore computational complexity for +the LSTM, espeacially for the case that the size of output units is relative +large (https://research.google.com/pubs/archive/43905.pdf). + +The formula is as follows: + +$$ +i_t = \sigma(W_{ix}x_{t} + W_{ir}r_{t-1} + W_{ic}c_{t-1} + b_i) \\ + +f_t = \sigma(W_{fx}x_{t} + W_{fr}r_{t-1} + W_{fc}c_{t-1} + b_f) \\ + +\tilde{c_t} = act_g(W_{cx}x_t + W_{cr}r_{t-1} + b_c) \\ + +o_t = \sigma(W_{ox}x_{t} + W_{or}r_{t-1} + W_{oc}c_t + b_o) \\ + +c_t = f_t \odot c_{t-1} + i_t \odot \tilde{c_t} \\ + +h_t = o_t \odot act_h(c_t) \\ + +r_t = \overline{act_h}(W_{rh}h_t) +$$ + +where the W terms denote weight matrices (e.g. $W_{xi}$ is the matrix +of weights from the input gate to the input), $W_{ic}, W_{fc}, W_{oc}$ +are diagonal weight matrices for peephole connections. In our implementation, +we use vectors to reprenset these diagonal weight matrices. The b terms +denote bias vectors ($b_i$ is the input gate bias vector), $\sigma$ +is the activation, such as logistic sigmoid function, and +$i, f, o$ and $c$ are the input gate, forget gate, output gate, +and cell activation vectors, respectively, all of which have the same size as +the cell output activation vector $h$. Here $h$ is usually called the hidden +state and $r$ denotes its recurrent projection. And $\tilde{c_t}$ is also +called the candidate hidden state, whose computation is based on the current +input and previous hidden state. + +The $\odot$ is the element-wise product of the vectors. $act_g$ and $act_h$ +are the cell input and cell output activation functions and `tanh` is usually +used for them. $\overline{act_h}$ is the activation function for the +projection output, usually using `identity` or same as $act_h$. + +Note that these $W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}$ +operations on the input $x_{t}$ are NOT included in this operator. 
+Users can choose to use fully-connected operator before LSTMP operator. + +)DOC"); + } +}; + +class LSTMPGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Projection"), + "Input(Projection) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Cell"), + "Input(Cell) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Weight"), + "Input(Weight) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasInput("ProjWeight"), + "Input(ProjWeight) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Bias"), + "Input(Bias) of LSTMP operator should not be null."); + + PADDLE_ENFORCE(ctx->HasInput("BatchGate"), + "Input(BatchGate) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasInput("BatchCellPreAct"), + "Input(BatchGate) of LSTMP operator should not be null."); + + auto SetOutGradDim = [&ctx](const std::string& name) { + auto g_name = framework::GradVarName(name); + if (ctx->HasOutput(g_name)) + ctx->SetOutputDim(g_name, ctx->GetInputDim(name)); + }; + + SetOutGradDim("Input"); + SetOutGradDim("Weight"); + SetOutGradDim("ProjWeight"); + SetOutGradDim("Bias"); + SetOutGradDim("H0"); + SetOutGradDim("C0"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), + ctx.device_context()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(lstmp, ops::LSTMPOp, ops::LSTMPOpMaker, lstmp_grad, + ops::LSTMPGradOp); +REGISTER_OP_CPU_KERNEL( + lstmp, ops::LSTMPKernel, + ops::LSTMPKernel); +REGISTER_OP_CPU_KERNEL( + lstmp_grad, ops::LSTMPGradKernel, + ops::LSTMPGradKernel); diff --git a/paddle/fluid/operators/lstmp_op.cu b/paddle/fluid/operators/lstmp_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..bcefb94c75b8577fefb1ee3b440dc5fb045562d5 --- /dev/null +++ b/paddle/fluid/operators/lstmp_op.cu @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/lstmp_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + lstmp, ops::LSTMPKernel, + ops::LSTMPKernel); +REGISTER_OP_CUDA_KERNEL( + lstmp_grad, + ops::LSTMPGradKernel, + ops::LSTMPGradKernel); diff --git a/paddle/fluid/operators/lstmp_op.h b/paddle/fluid/operators/lstmp_op.h new file mode 100644 index 0000000000000000000000000000000000000000..22ef4721860a493fded98cf32b40a2aceb851a5c --- /dev/null +++ b/paddle/fluid/operators/lstmp_op.h @@ -0,0 +1,496 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/math/detail/activation_functions.h" +#include "paddle/fluid/operators/math/lstm_compute.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/sequence2batch.h" + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using Tensor = framework::Tensor; + +template +using EigenMatrix = framework::EigenMatrix; + +template +inline void ReorderInitState(const DeviceContext& ctx, + const framework::Tensor& src, + framework::Vector index, + framework::Tensor* dst, bool indexed_src) { + math::CopyMatrixRowsFunctor row_shuffle; + dst->mutable_data(src.dims(), ctx.GetPlace()); + row_shuffle(ctx, src, index, *dst, indexed_src); +} + +template +class LSTMPKernel : public framework::OpKernel { + public: + template + void ActCompute(const math::detail::ActivationType act_type, const Device& d, + X x, Y y) const { + if (act_type == math::detail::ActivationType::kIdentity) + y.device(d) = x; + else if (act_type == math::detail::ActivationType::kSigmoid) + SigmoidFunctor()(d, x, y); + else if (act_type == math::detail::ActivationType::kTanh) + TanhFunctor()(d, x, y); + else if (act_type == math::detail::ActivationType::kReLU) + ReluFunctor()(d, x, y); + else + PADDLE_THROW("unsupported activation type"); + } + + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* weight = ctx.Input("Weight"); + auto* proj_weight = ctx.Input("ProjWeight"); + auto* bias = ctx.Input("Bias"); + + auto* hidden_t0 = ctx.Input("H0"); + auto* ordered_proj0 = ctx.Output("OrderedP0"); + auto* cell_t0 = ctx.Input("C0"); + + auto* batch_gate = ctx.Output("BatchGate"); + batch_gate->mutable_data(ctx.GetPlace()); + auto* proj_out = ctx.Output("Projection"); + proj_out->mutable_data(ctx.GetPlace()); + auto* cell_out = ctx.Output("Cell"); + cell_out->mutable_data(ctx.GetPlace()); + + bool is_reverse = ctx.Attr("is_reverse"); + math::LoDTensor2BatchFunctor to_batch; + auto& device_ctx = ctx.template device_context(); + to_batch(device_ctx, *input, *batch_gate, true, is_reverse); + + auto in_dims = input->dims(); + int frame_size = static_cast(in_dims[1] / 4); + framework::DDim dims({in_dims[0], frame_size}); + framework::DDim proj_dims({in_dims[0], proj_weight->dims()[1]}); + + if (bias) { + Tensor b = *bias; + b.Resize({bias->numel(), 1}); + Tensor gate_bias = b.Slice(0, 4 * frame_size); + math::RowwiseAdd add_bias; + add_bias(device_ctx, *batch_gate, gate_bias, batch_gate); + } + + math::LstmMetaValue lstmp_value; + if (bias && ctx.Attr("use_peepholes")) { + T* bias_data = const_cast(bias->data()); + // the code style in LstmpMetaValue will be updated later. 
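// Layout note (added for clarity; it matches the offsets used just below):
// when use_peepholes is true, the Bias input packs the gate biases and the
// three diagonal peephole weights into a single buffer of width
// 7 * frame_size, with D == frame_size:
//   [0, 4D)  : gate biases (the slice used by RowwiseAdd above)
//   [4D, 5D) : W_ic  -> lstmp_value.check_ig
//   [5D, 6D) : W_fc  -> lstmp_value.check_fg
//   [6D, 7D) : W_oc  -> lstmp_value.check_og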
+ + lstmp_value.check_ig = bias_data + 4 * frame_size; + lstmp_value.check_fg = lstmp_value.check_ig + frame_size; + lstmp_value.check_og = lstmp_value.check_fg + frame_size; + } else { + lstmp_value.check_ig = nullptr; + lstmp_value.check_fg = nullptr; + lstmp_value.check_og = nullptr; + } + lstmp_value.prev_state_value = nullptr; + Tensor ordered_c0; + + framework::Vector order(batch_gate->lod()[2]); + + if (cell_t0) { + // Since the batch computing for LSTMP reorders the input sequence + // according to their length. The initialized cell state also needs + // to reorder. + ReorderInitState(device_ctx, *cell_t0, order, + &ordered_c0, true); + lstmp_value.prev_state_value = ordered_c0.data(); + } + + // Use the local variable as here. + LoDTensor batch_proj, batch_cell; + auto* batch_cell_pre_act = ctx.Output("BatchCellPreAct"); + batch_cell_pre_act->mutable_data(dims, ctx.GetPlace()); + auto* batch_hidden = ctx.Output("BatchHidden"); + batch_hidden->mutable_data(dims, ctx.GetPlace()); // T x D + batch_proj.mutable_data(proj_dims, ctx.GetPlace()); // T x P + batch_cell.mutable_data(dims, ctx.GetPlace()); // T x D + + auto batch_starts = batch_gate->lod()[0]; + size_t num_batch = batch_starts.size() - 1; + auto gate_act = math::detail::GetActivationType( + ctx.Attr("gate_activation")); + auto cell_act = math::detail::GetActivationType( + ctx.Attr("cell_activation")); + auto cand_act = math::detail::GetActivationType( + ctx.Attr("candidate_activation")); + auto proj_act = math::detail::GetActivationType( + ctx.Attr("proj_activation")); + auto& place = *ctx.template device_context().eigen_device(); + + for (size_t n = 0; n < num_batch; n++) { + int bstart = static_cast(batch_starts[n]); + int bend = static_cast(batch_starts[n + 1]); + + Tensor gate_t = batch_gate->Slice(bstart, bend); + Tensor hidden_t = batch_hidden->Slice(bstart, bend); + Tensor proj_t = batch_proj.Slice(bstart, bend); + Tensor cell_t = batch_cell.Slice(bstart, bend); + Tensor cell_pre_act_t = batch_cell_pre_act->Slice(bstart, bend); + + int cur_batch_size = bend - bstart; + + if (n > 0) { + int pre_h_start = static_cast(batch_starts[n - 1]); + int pre_h_end = pre_h_start + cur_batch_size; + auto pre_proj_t = batch_proj.Slice(pre_h_start, pre_h_end); + math::matmul(device_ctx, pre_proj_t, false, *weight, + false, static_cast(1.0), &gate_t, + static_cast(1.0)); + } else if (hidden_t0) { + // If n == 0 and there is no initialized hidden state, that is to say + // the H0 is zeros, the calculation W_h * H0 will be skiped. + // If n == 0 and there is initialized hidden state, calculate W_h * H0. + + // Since the batch computing for LSTMP reorders the input sequence + // according to their length. The initialized hidden state also needs + // to reorder. 
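// A small example of the reordering semantics (a sketch, assuming
// CopyMatrixRowsFunctor copies src[index[i]] into dst[i] when indexed_src
// is true): with order = {2, 0, 1}, row 0 of the reordered state is row 2
// of H0, row 1 is row 0, and row 2 is row 1, so the initial states line up
// with the length-sorted, batch-reordered input sequences.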
+ + Tensor ordered_h0; + ordered_proj0->mutable_data(ctx.GetPlace()); + ReorderInitState(device_ctx, *hidden_t0, order, + &ordered_h0, true); + math::matmul(device_ctx, ordered_h0, false, + *proj_weight, false, static_cast(1.0), + ordered_proj0, static_cast(0.0)); + if (proj_act != math::detail::ActivationType::kIdentity) { + auto proj0_dev = EigenMatrix::From(*ordered_proj0); + ActCompute(cell_act, place, proj0_dev, proj0_dev); + } + math::matmul(device_ctx, *ordered_proj0, false, + *weight, false, static_cast(1.0), + &gate_t, static_cast(1.0)); + } + + lstmp_value.gate_value = gate_t.data(); + lstmp_value.output_value = hidden_t.data(); + lstmp_value.state_value = cell_t.data(); + lstmp_value.state_active_value = cell_pre_act_t.data(); + math::LstmUnitFunctor::compute( + device_ctx, lstmp_value, frame_size, cur_batch_size, gate_act, + cell_act, cand_act); + lstmp_value.prev_state_value = lstmp_value.state_value; + math::matmul(device_ctx, hidden_t, false, *proj_weight, + false, static_cast(1.0), &proj_t, + static_cast(0.0)); + if (proj_act != math::detail::ActivationType::kIdentity) { + auto proj_t_dev = EigenMatrix::From(proj_t); + ActCompute(cell_act, place, proj_t_dev, proj_t_dev); + } + } + + math::Batch2LoDTensorFunctor to_seq; + batch_proj.set_lod(batch_gate->lod()); + // restore the output hidden in LoDTensor from the batch hidden + to_seq(device_ctx, batch_proj, *proj_out); + + batch_cell.set_lod(batch_gate->lod()); + // restore the output cell state in LoDTensor from the batch cell + to_seq(device_ctx, batch_cell, *cell_out); + } +}; + +template +class LSTMPGradKernel : public framework::OpKernel { + public: + template + void ActGradCompute(const math::detail::ActivationType act_type, + const Device& d, X x, Y y, DX dx, DY dy) const { + // x is dummy and won't be used even in Relu(use y instead) + if (act_type == math::detail::ActivationType::kIdentity) + dx.device(d) = dy; + else if (act_type == math::detail::ActivationType::kSigmoid) + SigmoidGradFunctor()(d, x, y, dy, dx); + else if (act_type == math::detail::ActivationType::kTanh) + TanhGradFunctor()(d, x, y, dy, dx); + else if (act_type == math::detail::ActivationType::kReLU) + ReluGradFunctor()(d, x, y, dy, dx); + else + PADDLE_THROW("unsupported activation type"); + } + + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* weight = ctx.Input("Weight"); + auto* proj_weight = ctx.Input("ProjWeight"); + auto* bias = ctx.Input("Bias"); + + auto* proj_out = ctx.Input("Projection"); + auto* cell_out = ctx.Input("Cell"); + + auto* batch_gate = ctx.Input("BatchGate"); + auto* batch_cell_pre_act = ctx.Input("BatchCellPreAct"); + auto* batch_hidden = ctx.Input("BatchHidden"); + + auto* projection_g = + ctx.Input(framework::GradVarName("Projection")); + + auto* in_g = ctx.Output(framework::GradVarName("Input")); + auto* weight_g = ctx.Output(framework::GradVarName("Weight")); + auto* proj_weight_g = + ctx.Output(framework::GradVarName("ProjWeight")); + auto* bias_g = ctx.Output(framework::GradVarName("Bias")); + + auto* h0 = ctx.Input("H0"); + auto* ordered_proj0 = ctx.Input("OrderedP0"); + auto* c0 = ctx.Input("C0"); + + auto* h0_g = ctx.Output(framework::GradVarName("H0")); + auto* c0_g = ctx.Output(framework::GradVarName("C0")); + + auto& device_ctx = ctx.template device_context(); + math::SetConstant zero; + if (weight_g) { + weight_g->mutable_data(ctx.GetPlace()); + zero(device_ctx, weight_g, static_cast(0.0)); + } + if (proj_weight_g) { + 
proj_weight_g->mutable_data(ctx.GetPlace()); + zero(device_ctx, proj_weight_g, static_cast(0.0)); + } + + // ordered_h0/c0 is the reordered hidden/cell initialization. + // ordered_h0_g/c0_g is the reordered gradient of hidden/cell + // initialization. + Tensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g; + + framework::Vector order(batch_gate->lod()[2]); + + if (c0) { + ReorderInitState(device_ctx, *c0, order, &ordered_c0, + true); + } + if (c0 && c0_g) { + ordered_c0_g.mutable_data(c0_g->dims(), ctx.GetPlace()); + } + + auto in_dims = input->dims(); + auto out_dims = cell_out->dims(); + framework::DDim proj_dims({in_dims[0], proj_weight->dims()[1]}); + int frame_size = static_cast(in_dims[1] / 4); + PADDLE_ENFORCE_EQ(frame_size, out_dims[1]); + + math::LstmMetaValue lstmp_value; + if (bias && ctx.Attr("use_peepholes")) { + T* bias_data = const_cast(bias->data()); + lstmp_value.check_ig = bias_data + 4 * frame_size; + lstmp_value.check_fg = lstmp_value.check_ig + frame_size; + lstmp_value.check_og = lstmp_value.check_fg + frame_size; + } else { + lstmp_value.check_ig = nullptr; + lstmp_value.check_fg = nullptr; + lstmp_value.check_og = nullptr; + } + + math::LstmMetaGrad lstmp_grad; + + if (bias && bias_g) { + bias_g->mutable_data(ctx.GetPlace()); + zero(device_ctx, bias_g, static_cast(0.0)); + } + if (bias && bias_g && ctx.Attr("use_peepholes")) { + T* bias_g_data = bias_g->data(); + lstmp_grad.check_ig_grad = bias_g_data + 4 * frame_size; + lstmp_grad.check_fg_grad = lstmp_grad.check_ig_grad + frame_size; + lstmp_grad.check_og_grad = lstmp_grad.check_fg_grad + frame_size; + } else { + lstmp_grad.check_ig_grad = nullptr; + lstmp_grad.check_fg_grad = nullptr; + lstmp_grad.check_og_grad = nullptr; + } + + math::LoDTensor2BatchFunctor to_batch; + + auto ToBatch = [&batch_gate, &to_batch]( + const DeviceContext& ctx, const framework::LoDTensor& src, + const framework::DDim& dims, framework::LoDTensor& dst) { + dst.mutable_data(dims, ctx.GetPlace()); + dst.set_lod(batch_gate->lod()); + to_batch(ctx, src, dst, false); + }; + + LoDTensor batch_hidden_g, batch_proj, batch_proj_g, batch_cell; + batch_hidden_g.mutable_data(out_dims, ctx.GetPlace()); + ToBatch(device_ctx, *proj_out, proj_dims, batch_proj); // T x P + ToBatch(device_ctx, *projection_g, proj_dims, batch_proj_g); // T x P + ToBatch(device_ctx, *cell_out, out_dims, batch_cell); // T x D + + LoDTensor batch_cell_g, batch_gate_g; + batch_cell_g.mutable_data(out_dims, ctx.GetPlace()); + // TODO(qingqing) support the case output cell has gradient. 
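// Until that is supported, batch_cell_g is simply zero-initialized below:
// the backward pass assumes the only externally provided gradient is
// Projection@GRAD, and cell-state gradients are produced internally by the
// reversed time-step loop (batch_starts, taken from batch_gate->lod()[0],
// gives the row range of each time step in the batch-reordered tensors).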
+ // to_batch(device_ctx, *cell_g, batch_cell_g, false); + zero(device_ctx, &batch_cell_g, static_cast(0.0)); + batch_gate_g.mutable_data(batch_gate->dims(), ctx.GetPlace()); + batch_gate_g.set_lod(batch_gate->lod()); + + auto gate_act = math::detail::GetActivationType( + ctx.Attr("gate_activation")); + auto cell_act = math::detail::GetActivationType( + ctx.Attr("cell_activation")); + auto cand_act = math::detail::GetActivationType( + ctx.Attr("candidate_activation")); + auto proj_act = math::detail::GetActivationType( + ctx.Attr("proj_activation")); + auto& place = *ctx.template device_context().eigen_device(); + + auto batch_starts = batch_gate->lod()[0]; + size_t num_batch = batch_starts.size() - 1; + for (int n = static_cast(num_batch) - 1; n >= 0; n--) { + int bstart = static_cast(batch_starts[n]); + int bend = static_cast(batch_starts[n + 1]); + + Tensor cur_proj = batch_proj.Slice(bstart, bend); + Tensor proj_g = batch_proj_g.Slice(bstart, bend); + if (proj_act != math::detail::ActivationType::kIdentity) { + auto cur_proj_dev = EigenMatrix::From(cur_proj); + auto proj_g_dev = EigenMatrix::From(proj_g); + ActGradCompute(cell_act, place, cur_proj_dev, cur_proj_dev, proj_g_dev, + proj_g_dev); + } + /* hidden state backwarad */ + Tensor out_g = batch_hidden_g.Slice(bstart, bend); + math::matmul(device_ctx, proj_g, false, *proj_weight, + true, static_cast(1.0), &out_g, + static_cast(0.0)); + /* projection weight backward*/ + if (proj_weight_g) { + Tensor hidden_t = batch_hidden->Slice(bstart, bend); + math::matmul(device_ctx, hidden_t, true, proj_g, + false, static_cast(1.0), + proj_weight_g, static_cast(1.0)); + } + + Tensor gate = batch_gate->Slice(bstart, bend); + Tensor cell = batch_cell.Slice(bstart, bend); + Tensor cell_pre_act = batch_cell_pre_act->Slice(bstart, bend); + lstmp_value.gate_value = gate.data(); + lstmp_value.state_value = cell.data(); + lstmp_value.state_active_value = cell_pre_act.data(); + + Tensor gate_g = batch_gate_g.Slice(bstart, bend); + Tensor cell_g = batch_cell_g.Slice(bstart, bend); + lstmp_grad.state_grad = cell_g.data(); + lstmp_grad.gate_grad = gate_g.data(); + lstmp_grad.output_grad = out_g.data(); + + if (n > 0) { + int bstart_pre = static_cast(batch_starts[n - 1]); + Tensor cell_pre = batch_cell.Slice(bstart_pre, bstart); + Tensor cell_pre_g = batch_cell_g.Slice(bstart_pre, bstart); + lstmp_value.prev_state_value = cell_pre.data(); + lstmp_grad.prev_state_grad = cell_pre_g.data(); + } else { + lstmp_value.prev_state_value = c0 ? ordered_c0.data() : nullptr; + lstmp_grad.prev_state_grad = c0_g ? 
ordered_c0_g.data() : nullptr; + } + + int cur_batch_size = bend - bstart; + math::LstmUnitGradFunctor::compute( + device_ctx, lstmp_value, lstmp_grad, frame_size, cur_batch_size, + gate_act, cell_act, cand_act); + + if (n > 0) { + int pre_h_start = static_cast(batch_starts[n - 1]); + int pre_h_end = pre_h_start + cur_batch_size; + auto pre_proj_g = batch_proj_g.Slice(pre_h_start, pre_h_end); + math::matmul(device_ctx, gate_g, false, *weight, true, + static_cast(1.0), &pre_proj_g, + static_cast(1.0)); + if (weight_g) { + /* weight backward*/ + auto pre_proj = batch_proj.Slice(pre_h_start, pre_h_end); + math::matmul(device_ctx, pre_proj, true, gate_g, + false, static_cast(1.0), weight_g, + static_cast(1.0)); + } + } else { + if (h0 && weight_g) { + ReorderInitState(device_ctx, *h0, order, + &ordered_h0, true); + if (weight_g) { + math::matmul(device_ctx, *ordered_proj0, true, + gate_g, false, static_cast(1.0), + weight_g, static_cast(1.0)); + } + } + if (h0 && (h0_g || proj_weight_g)) { + ordered_h0_g.mutable_data(h0_g->dims(), ctx.GetPlace()); + Tensor proj0_g; + proj0_g.Resize({in_dims[0], proj_weight->dims()[1]}); + proj0_g.mutable_data(ctx.GetPlace()); + math::matmul(device_ctx, gate_g, false, *weight, + true, static_cast(1.0), &proj0_g, + static_cast(0.0)); + if (proj_act != math::detail::ActivationType::kIdentity) { + auto proj0_dev = EigenMatrix::From(*ordered_proj0); + auto proj0_g_dev = EigenMatrix::From(proj0_g); + ActGradCompute(cell_act, place, proj0_dev, proj0_dev, proj0_g_dev, + proj0_g_dev); + } + if (h0_g) { + math::matmul( + device_ctx, proj0_g, false, *proj_weight, true, + static_cast(1.0), &ordered_h0_g, static_cast(0.0)); + } + if (proj_weight_g) { + math::matmul(device_ctx, ordered_h0, true, + proj0_g, false, static_cast(1.0), + proj_weight_g, static_cast(1.0)); + } + } + } + } + + math::Batch2LoDTensorFunctor to_seq; + if (in_g) { + /* backward data */ + in_g->mutable_data(ctx.GetPlace()); + to_seq(device_ctx, batch_gate_g, *in_g); + } + if (bias && bias_g) { + /* backward bias */ + Tensor b_g = *bias_g; + b_g.Resize({bias_g->numel(), 1}); + Tensor gate_bias_g = b_g.Slice(0, 4 * frame_size); + math::ColwiseSum col_sum; + col_sum(device_ctx, batch_gate_g, &gate_bias_g); + } + + if (h0 && h0_g) { + ReorderInitState(device_ctx, ordered_h0_g, order, h0_g, + false); + } + if (c0 && c0_g) { + ReorderInitState(device_ctx, ordered_c0_g, order, c0_g, + false); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/margin_rank_loss_op.cc b/paddle/fluid/operators/margin_rank_loss_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..fc31befb20526e84aae1804756d2d44a785aa229 --- /dev/null +++ b/paddle/fluid/operators/margin_rank_loss_op.cc @@ -0,0 +1,122 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/margin_rank_loss_op.h" + +namespace paddle { +namespace operators { + +class MarginRankLossOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + // input check + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("X1"), "Input(X1) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("X2"), "Input(X2) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) shouldn't be null."); + auto label_dims = ctx->GetInputDim("Label"); + auto x1_dims = ctx->GetInputDim("X1"); + auto x2_dims = ctx->GetInputDim("X2"); + PADDLE_ENFORCE( + (label_dims == x1_dims) && (x1_dims == x2_dims) && + (label_dims.size() == 2) && (label_dims[1] == 1), + "All inputs must be 2-D tensor with shape [batch_size x 1]."); + ctx->SetOutputDim("Activated", label_dims); + ctx->SetOutputDim("Out", label_dims); + } +}; + +template +class MarginRankLossOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MarginRankLossOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X1", + "(2-D tensor with shape [batch_size x 1]) The score for " + "one item X1 to be ranked, from pairwise ranking model."); + AddInput("X2", + "(2-D tensor with shape [batch_size x 1]) The score for " + "another item X2 to be ranked, from pairwise ranking model."); + AddInput("Label", + "(2-D tensor with shape [batch_size x 1]) " + "The label indicating X1 ranked higher than X2 or not, " + "can only be +1 or -1."); + AddOutput("Activated", + "(2-D tensor with shape [batch_size x 1]) Intermediate tensor " + "to indicate whether each element of Output(Out) is activated.") + .AsIntermediate(); + AddOutput("Out", + "(2-D tensor with shape [batch_size x 1]) " + "The output loss of MarginRankLoss operator."); + AddAttr("margin", "(scalar, default 0) Margin for MarginRankLossOp.") + .SetDefault(static_cast(0)); + AddComment(R"DOC( +MarginRankLoss Operator. + +This operator measures the loss given a pair of training sample +{`X1`, `X2`} and the `Label` with attribute `margin`, where `Label = +1` +indicating X1 is ranked higher than `X2` and `Label = -1` otherwise. The loss +is calculated as: + +$loss(X1, X2, Label) = \max(0, -Label * (X1 - X2) + margin)$ + +The attribute `margin` here helps make the predictions more robust. +Denote the item ranked higher as the positive sample, otherwise the negative +sample. If the score of the two samples satisfies + +$positive sample - negative sample < margin$ + +the pair of samples will contribute to the final loss, which will backpropagate +and train the ranking model to enlarge the difference between the two scores. + +For batch input with size `batch_size`, `X1`, `X2` and `Label` +all have the same shape [batch_size x 1]. 
+ +)DOC"); + } +}; + +class MarginRankLossGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("X1"), "Input(X1) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("X2"), "Input(X2) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("Activated"), + "Intermediate(Activated) shouldn't be null."); + auto dims = ctx->GetInputDim("Label"); + ctx->SetOutputDim(framework::GradVarName("X1"), dims); + ctx->SetOutputDim(framework::GradVarName("X2"), dims); + } +}; + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; + +REGISTER_OP(margin_rank_loss, ops::MarginRankLossOp, + ops::MarginRankLossOpMaker, margin_rank_loss_grad, + ops::MarginRankLossGradOp); +REGISTER_OP_CPU_KERNEL( + margin_rank_loss, + ops::MarginRankLossKernel); +REGISTER_OP_CPU_KERNEL( + margin_rank_loss_grad, + ops::MarginRankLossGradKernel); diff --git a/paddle/fluid/operators/margin_rank_loss_op.cu b/paddle/fluid/operators/margin_rank_loss_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..ca4593a48d6d3eccff81dfd621ea1198e5bad880 --- /dev/null +++ b/paddle/fluid/operators/margin_rank_loss_op.cu @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/margin_rank_loss_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + margin_rank_loss, + ops::MarginRankLossKernel); +REGISTER_OP_CUDA_KERNEL( + margin_rank_loss_grad, + ops::MarginRankLossGradKernel); diff --git a/paddle/fluid/operators/margin_rank_loss_op.h b/paddle/fluid/operators/margin_rank_loss_op.h new file mode 100644 index 0000000000000000000000000000000000000000..934a5da0f804f7cf7dc176a9ee4e1b72261ef008 --- /dev/null +++ b/paddle/fluid/operators/margin_rank_loss_op.h @@ -0,0 +1,98 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +struct ReLU { + HOSTDEVICE T operator()(const T& val) const { + return val > 0 ? 
val : static_cast(0); + } +}; + +template +struct Heaviside { + HOSTDEVICE T operator()(const T& val) const { + return static_cast(val > 0 ? 1 : 0); + } +}; + +template +class MarginRankLossKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* out_t = ctx.Output("Out"); + auto* act_t = ctx.Output("Activated"); + + auto* label_t = ctx.Input("Label"); + auto* x1_t = ctx.Input("X1"); + auto* x2_t = ctx.Input("X2"); + + out_t->mutable_data(ctx.GetPlace()); + act_t->mutable_data(ctx.GetPlace()); + + auto margin = static_cast(ctx.Attr("margin")); + auto out = framework::EigenVector::Flatten(*out_t); + auto act = framework::EigenVector::Flatten(*act_t); + + auto label = framework::EigenVector::Flatten(*label_t); + auto x1 = framework::EigenVector::Flatten(*x1_t); + auto x2 = framework::EigenVector::Flatten(*x2_t); + + auto& dev = *ctx.template device_context().eigen_device(); + out.device(dev) = (-label * (x1 - x2) + margin).unaryExpr(ReLU()); + act.device(dev) = out.unaryExpr(Heaviside()); + } +}; + +template +class MarginRankLossGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* d_x1_t = + ctx.Output(framework::GradVarName("X1")); + auto* d_x2_t = + ctx.Output(framework::GradVarName("X2")); + + auto* act_t = ctx.Input("Activated"); + auto* d_out_t = ctx.Input(framework::GradVarName("Out")); + auto* label_t = ctx.Input("Label"); + + auto d_out = framework::EigenVector::Flatten(*d_out_t); + auto act = framework::EigenVector::Flatten(*act_t); + auto label = framework::EigenVector::Flatten(*label_t); + auto& dev = *ctx.template device_context().eigen_device(); + + // compute d_x1 + if (d_x1_t) { + d_x1_t->mutable_data(ctx.GetPlace()); + auto d_x1 = framework::EigenVector::Flatten(*d_x1_t); + d_x1.device(dev) = -d_out * act * label; + } + // compute d_x2 + if (d_x2_t) { + d_x2_t->mutable_data(ctx.GetPlace()); + auto d_x2 = framework::EigenVector::Flatten(*d_x2_t); + d_x2.device(dev) = d_out * act * label; + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt similarity index 100% rename from paddle/operators/math/CMakeLists.txt rename to paddle/fluid/operators/math/CMakeLists.txt diff --git a/paddle/fluid/operators/math/context_project.cc b/paddle/fluid/operators/math/context_project.cc new file mode 100644 index 0000000000000000000000000000000000000000..b73d976d1b3e6dcf99e5cc525263282b1253c600 --- /dev/null +++ b/paddle/fluid/operators/math/context_project.cc @@ -0,0 +1,26 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/context_project.h" + +namespace paddle { +namespace operators { +namespace math { + +template class ContextProjectFunctor; +template class ContextProjectFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/context_project.cu b/paddle/fluid/operators/math/context_project.cu new file mode 100644 index 0000000000000000000000000000000000000000..bbd36a6e8f54833f15ee0c991228c10b7f74f272 --- /dev/null +++ b/paddle/fluid/operators/math/context_project.cu @@ -0,0 +1,28 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU + +#include "paddle/fluid/operators/math/context_project.h" + +namespace paddle { +namespace operators { +namespace math { + +template class ContextProjectFunctor; +template class ContextProjectFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/context_project.h b/paddle/fluid/operators/math/context_project.h new file mode 100644 index 0000000000000000000000000000000000000000..2fe593ec3af9d07a2cbafc69e8d3a52e2c43e76b --- /dev/null +++ b/paddle/fluid/operators/math/context_project.h @@ -0,0 +1,306 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/operators/math/im2col.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { +namespace math { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +/* + * \brief Context projection concatenates features in adjacent time-steps in + * a sequence. The i-th row of the output is the concatenation of + * context_length rows of the input. The context_length rows are the + * consecutive rows from the i+shift_start row. + * ContextProjectGradFunctor is the inverse process of ContextProjectFunctor. + * + * \param in Input data. + * \param Shape The shape of Input data: + * [mini-batch, input_hidden_size]. + * + * \param padding_data Padding data. + * \param Shape The shape of Padding data: + * [up_pad + down_pad, input_hidden_size]. + * + * \param col Col data. + * \param Shape The shape of Col data: + * [mini-batch, context_length * input_hidden_size]. 
+ * + * For a mini-batch of 2 variable lengths sentences, containing 3, and 1 + * time-steps: + * + * Assumed input (X) is a [4, M, N] float LoDTensor, and X->lod()[0] = [0, 3, + * 4]. + * Besides, for the sake of simplicity, we assume M=1 and N=2. + * + * X = [[a1, a2; + * b1, b2; + * c1, c2] + * [d1, d2]] + * + * This is to say that input (X) has 4 words and the dimension of each word + * representation is 2. + * + * - Case1: + * If context_start is -1 and padding_trainable is false, we use zero to pad + * instead of learned weight to pad, + * and the context_length is 3, the output (Out) is: + * + * Out =[[0, 0, a1, a2, b1, b2; + * a1, a2, b1, b2, c1, c2; + * b1, b2, c1, c2, 0, 0 ] + * [0, 0, d1, d2, 0, 0 ]] + * + * - Case2: + * If context_start is -1 and padding_trainable is true, we use learned weight + * to pad, + * and the context_length is 3, the output (Out) is: + * + * Out = [[w1, w2, a1, a2, b1, b2; + * a1, a2, b1, b2, c1, c2; + * b1, b2, c1, c2, w3, w4] + * [w1, w2, d1, d2, w3, w4]] + * + */ + +template +class ContextProjectFunctor { + public: + void operator()(const DeviceContext& context, const LoDTensor& in, + const Tensor& padding_data, bool padding_trainable, + const int context_start, const int context_length, + const int context_stride, const int up_pad, + const int down_pad, Tensor* col) { + auto lod_level_0 = in.lod()[0]; + + math::Im2ColFunctor im2col_ocf; + + std::vector dilation({1, 1}); + std::vector padding({up_pad, 0, down_pad, 0}); + std::vector stride({context_stride, 1}); + + int input_row_begin, input_row_end; + int sequence_height, sequence_width; + sequence_width = in.dims()[1]; + + for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { + input_row_begin = (context_start > 0) + ? static_cast(lod_level_0[i]) + context_start + : static_cast(lod_level_0[i]); + input_row_end = static_cast(lod_level_0[i + 1]); + + Tensor out_t = col->Slice(static_cast(lod_level_0[i]), + static_cast(lod_level_0[i + 1])); + + sequence_height = static_cast(out_t.dims()[0]); + + if (input_row_begin < input_row_end) { + Tensor in_t = in.Slice(input_row_begin, input_row_end); + + std::vector output_shape( + {sequence_height, 1, 1, context_length, + sequence_width}); // output_height, output_width, + // input_channels, filter_height, filter_width + out_t.Resize(framework::make_ddim(output_shape)); + + std::vector input_shape( + {1, input_row_end - input_row_begin, + sequence_width}); // input_channels, input_height, input_width + in_t.Resize(framework::make_ddim(input_shape)); + im2col_ocf(context, in_t, dilation, stride, padding, &out_t); + out_t.Resize({sequence_height, context_length * sequence_width}); + } + } + if (padding_trainable) { + for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { + Tensor out_t = col->Slice(static_cast(lod_level_0[i]), + static_cast(lod_level_0[i + 1])); + + sequence_height = static_cast(out_t.dims()[0]); + + // add up trainable data + out_t.Resize({sequence_height * context_length, sequence_width}); + + if (up_pad > 0) { // add up pad + int padding_rows = std::min( + up_pad, static_cast(lod_level_0[i + 1] - lod_level_0[i])); + + for (int k = 0; k < padding_rows; ++k) { + int padding_size = + k + context_length < up_pad ? 
context_length : up_pad - k; + Tensor out_t_sub = out_t.Slice(k * context_length, + k * context_length + padding_size); + Tensor w_sub = padding_data.Slice(k, k + padding_size); + framework::Copy(w_sub, context.GetPlace(), context, &out_t_sub); + } + } + if (down_pad > 0) { // add down pad + int down_pad_begin_row = + std::max(0, + (sequence_height - context_start - context_length) + 1) + + 1; + int padding_begin = std::max(0, context_start - sequence_height); + int padding_size = + sequence_height - context_start >= context_length + ? 1 + : context_length - (sequence_height - context_start); + if (context_start >= sequence_height) padding_size = context_length; + int padding_idx = padding_begin; + for (int t = 0; t + down_pad_begin_row <= sequence_height; + ++t, ++padding_size) { + if (context_start >= sequence_height) padding_size = context_length; + if (padding_size > context_length) { + padding_size = context_length; + padding_idx++; + } + if (padding_begin > 0 || sequence_height == context_start) + padding_idx = padding_begin + t; + + Tensor out_t_sub = out_t.Slice( + (down_pad_begin_row + t) * context_length - padding_size, + (down_pad_begin_row + t) * context_length); + Tensor w_sub = padding_data.Slice( + up_pad + padding_idx, up_pad + padding_idx + padding_size); + framework::Copy(w_sub, context.GetPlace(), context, &out_t_sub); + } + } + out_t.Resize({sequence_height, context_length * sequence_width}); + } + } + } +}; + +template +class ContextProjectGradFunctor { + public: + void operator()(const DeviceContext& context, const LoDTensor& in, + bool padding_trainable, const int context_start, + const int context_length, const int context_stride, + const int up_pad, const int down_pad, bool pad_grad, + bool input_grad, Tensor* padding_data, Tensor* col) { + auto lod_level_0 = in.lod()[0]; + + math::Col2ImFunctor col2im_ocf; + + std::vector dilation({1, 1}); + std::vector padding({up_pad, 0, down_pad, 0}); + std::vector stride({context_stride, 1}); + + int input_row_begin, input_row_end; + int sequence_height, sequence_width; + sequence_width = in.dims()[1]; + + if (input_grad) { + for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { + input_row_begin = (context_start > 0) + ? 
static_cast(lod_level_0[i]) + context_start + : static_cast(lod_level_0[i]); + input_row_end = static_cast(lod_level_0[i + 1]); + + Tensor out_t = col->Slice(static_cast(lod_level_0[i]), + static_cast(lod_level_0[i + 1])); + + sequence_height = static_cast(out_t.dims()[0]); + + if (input_row_begin < input_row_end) { + Tensor in_t = in.Slice(input_row_begin, input_row_end); + + std::vector output_shape( + {sequence_height, 1, 1, context_length, + sequence_width}); // output_height, output_width, + // input_channels, filter_height, filter_width + out_t.Resize(framework::make_ddim(output_shape)); + + std::vector input_shape( + {1, input_row_end - input_row_begin, + sequence_width}); // input_channels, input_height, input_width + in_t.Resize(framework::make_ddim(input_shape)); + + col2im_ocf(context, out_t, dilation, stride, padding, &in_t); + out_t.Resize({sequence_height, context_length * sequence_width}); + } + } + } + if (pad_grad) { + if (padding_trainable) { + for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { + Tensor out_t = col->Slice(static_cast(lod_level_0[i]), + static_cast(lod_level_0[i + 1])); + + sequence_height = static_cast(out_t.dims()[0]); + out_t.Resize({sequence_height * context_length, sequence_width}); + + if (up_pad > 0) { + int padding_rows = std::min( + up_pad, static_cast(lod_level_0[i + 1] - lod_level_0[i])); + + for (int k = 0; k < padding_rows; ++k) { + int padding_size = + k + context_length < up_pad ? context_length : up_pad - k; + Tensor out_t_sub = out_t.Slice(k * context_length, + k * context_length + padding_size); + Tensor w_sub = padding_data->Slice(k, k + padding_size); + axpy(context, w_sub.numel(), static_cast(1), + out_t_sub.data(), w_sub.data()); + } + } + if (down_pad > 0) { + int down_pad_begin_row = + std::max( + 0, (sequence_height - context_start - context_length) + 1) + + 1; + int padding_begin = std::max(0, context_start - sequence_height); + int padding_size = + sequence_height - context_start >= context_length + ? 1 + : context_length - (sequence_height - context_start); + if (context_start >= sequence_height) padding_size = context_length; + int padding_idx = padding_begin; + for (int t = 0; t + down_pad_begin_row <= sequence_height; + ++t, ++padding_size) { + if (context_start >= sequence_height) + padding_size = context_length; + if (padding_size > context_length) { + padding_size = context_length; + padding_idx++; + } + if (padding_begin > 0 || sequence_height == context_start) + padding_idx = padding_begin + t; + + Tensor out_t_sub = out_t.Slice( + (down_pad_begin_row + t) * context_length - padding_size, + (down_pad_begin_row + t) * context_length); + Tensor w_sub = padding_data->Slice( + up_pad + padding_idx, up_pad + padding_idx + padding_size); + axpy(context, w_sub.numel(), static_cast(1), + out_t_sub.data(), w_sub.data()); + } + } + out_t.Resize({sequence_height, context_length * sequence_width}); + } + } + } + } +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/cos_sim_functor.cc b/paddle/fluid/operators/math/cos_sim_functor.cc new file mode 100644 index 0000000000000000000000000000000000000000..701a9c23c0da3afbb643e9a821b7b74e69170710 --- /dev/null +++ b/paddle/fluid/operators/math/cos_sim_functor.cc @@ -0,0 +1,48 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/cos_sim_functor.h" + +namespace paddle { +namespace operators { +namespace math { + +template +struct CosSimDyFunctor { + void operator()(const platform::CPUDeviceContext& ctx, const T* x_norm, + const T* y_norm, const T* x, const T* y, const T* z, + const T* dz, const size_t rows, const size_t cols, + T* dy) const { + for (size_t row_id = 0; row_id < rows; ++row_id) { + auto xy_norm_prod = x_norm[row_id] * y_norm[0]; + auto dz_data = dz[row_id]; + auto z_data = z[row_id]; + auto* x_data = x + cols * row_id; + auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; + + auto y_norm_square = y_norm[0] * y_norm[0]; + auto reciprocal_y_norm_square = 1 / y_norm_square; + for (size_t i = 0; i < cols; ++i) { + dy[i] += dz_data * (x_data[i] * reciprocal_xy_norm_prod - + z_data * y[i] * reciprocal_y_norm_square); + } + } + } +}; + +template struct CosSimDyFunctor; +template struct CosSimDyFunctor; +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/cos_sim_functor.cu b/paddle/fluid/operators/math/cos_sim_functor.cu new file mode 100644 index 0000000000000000000000000000000000000000..0323680870ad835afca5a896f80d3abde0aad11c --- /dev/null +++ b/paddle/fluid/operators/math/cos_sim_functor.cu @@ -0,0 +1,64 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/cos_sim_functor.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { +namespace math { + +template +__global__ void CosSimDyKernel(const T* x_norm, const T* y_norm, const T* x, + const T* y, const T* z, const T* dz, + const size_t rows, const size_t cols, T* dy) { + int grid_size = blockDim.x * gridDim.x; + T y_norm_data = y_norm[0]; + for (int row_id = blockIdx.x * blockDim.x + threadIdx.x; row_id < rows; + row_id += grid_size) { + T xy_norm_prod = x_norm[row_id] * y_norm_data; + T dz_data = dz[row_id]; + T z_data = z[row_id]; + const T* x_data = x + cols * row_id; + T reciprocal_xy_norm_prod = 1 / xy_norm_prod; + + T y_norm_square = y_norm_data * y_norm_data; + T reciprocal_y_norm_square = 1 / y_norm_square; + for (size_t i = 0; i < cols; ++i) { + T dy_data = dz_data * (x_data[i] * reciprocal_xy_norm_prod - + z_data * y[i] * reciprocal_y_norm_square); + platform::CudaAtomicAdd(dy + i, dy_data); + } + } +} + +template +struct CosSimDyFunctor { + void operator()(const platform::CUDADeviceContext& ctx, const T* x_norm, + const T* y_norm, const T* x, const T* y, const T* z, + const T* dz, const size_t rows, const size_t cols, + T* dy) const { + const int block_size = 512; + dim3 threads(block_size, 1); + dim3 grid(1, (rows + block_size - 1) / block_size); + CosSimDyKernel<<>>( + x_norm, y_norm, x, y, z, dz, rows, cols, dy); + } +}; + +template struct CosSimDyFunctor; +template struct CosSimDyFunctor; +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/cos_sim_functor.h b/paddle/fluid/operators/math/cos_sim_functor.h new file mode 100644 index 0000000000000000000000000000000000000000..445d94f975f3448cc09c21be8e0a13d73d002382 --- /dev/null +++ b/paddle/fluid/operators/math/cos_sim_functor.h @@ -0,0 +1,166 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { +namespace math { + +template +struct CosSimFunctor { + CosSimFunctor(const T* x, const T* y, T* x_norm, T* y_norm, T* z, int cols) + : x_norm_(x_norm), + y_norm_(y_norm), + x_(x), + y_(y), + z_(z), + cols_(static_cast(cols)) {} + + inline HOSTDEVICE void operator()(size_t row_id) const { + auto* x = x_ + cols_ * row_id; + T xx = 0, xy = 0, yy = 0; + if (same_row) { + auto* y = y_ + cols_ * row_id; + T tep_x, tep_y; + for (size_t i = 0; i < cols_; ++i) { + tep_x = x[i]; + tep_y = y[i]; + xx += tep_x * tep_x; + yy += tep_y * tep_y; + xy += tep_x * tep_y; + } + xx = sqrt(xx); + yy = sqrt(yy); + y_norm_[row_id] = yy; + x_norm_[row_id] = xx; + z_[row_id] = xy / (xx * yy); + } else { // This can be wrote in a better way. 
+ T tep_x, tep_y; + for (size_t i = 0; i < cols_; ++i) { + tep_x = x[i]; + tep_y = y_[i]; + xx += tep_x * tep_x; + yy += tep_y * tep_y; + xy += tep_x * tep_y; + } + xx = sqrt(xx); + yy = sqrt(yy); + if (row_id == 0) y_norm_[0] = yy; + x_norm_[row_id] = xx; + z_[row_id] = xy / (xx * yy); + } + } + + T* x_norm_; + T* y_norm_; + const T* x_; + const T* y_; + T* z_; + const size_t cols_; +}; + +template +struct CosSimGradFunctor { + CosSimGradFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y, + const T* z, const T* dz, T* dx, int cols) + : x_norm_(x_norm), + y_norm_(y_norm), + x_(x), + y_(y), + z_(z), + dz_(dz), + dx_(dx), + cols_(static_cast(cols)) {} + + inline HOSTDEVICE void operator()(size_t row_id) const { + auto x_norm_square = x_norm_[row_id] * x_norm_[row_id]; + auto xy_norm_prod = x_norm_[row_id] * y_norm_[row_id]; + auto dz = dz_[row_id]; + auto z = z_[row_id]; + + auto* dx = dx_ + cols_ * row_id; + auto* x = x_ + cols_ * row_id; + auto* y = y_ + cols_ * row_id; + + auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; + auto reciprocal_x_norm_square = 1 / x_norm_square; + for (size_t i = 0; i < cols_; ++i) { + dx[i] = dz * (y[i] * reciprocal_xy_norm_prod - + z * x[i] * reciprocal_x_norm_square); + } + } + + const T* x_norm_; + const T* y_norm_; + const T* x_; + const T* y_; + const T* z_; + const T* dz_; + T* dx_; + const size_t cols_; +}; + +template +struct CosSimDxFunctor { + CosSimDxFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y, + const T* z, const T* dz, T* dx, int cols) + : x_norm_(x_norm), + y_norm_(y_norm), + x_(x), + y_(y), + z_(z), + dz_(dz), + dx_(dx), + cols_(static_cast(cols)) {} + + inline HOSTDEVICE void operator()(size_t row_id) const { + auto xy_norm_prod = x_norm_[row_id] * y_norm_[0]; + auto dz = dz_[row_id]; + auto z = z_[row_id]; + auto* x = x_ + cols_ * row_id; + auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; + auto x_norm_square = x_norm_[row_id] * x_norm_[row_id]; + auto* dx = dx_ + cols_ * row_id; + auto reciprocal_x_norm_square = 1 / x_norm_square; + + for (size_t i = 0; i < cols_; ++i) { + dx[i] = dz * (y_[i] * reciprocal_xy_norm_prod - + z * x[i] * reciprocal_x_norm_square); + } + } + const T* x_norm_; + const T* y_norm_; + const T* x_; + const T* y_; + const T* z_; + const T* dz_; + T* dx_; + const size_t cols_; +}; + +template +struct CosSimDyFunctor { + void operator()(const DeviceContext& ctx, const T* x_norm, const T* y_norm, + const T* x, const T* y, const T* z, const T* dz, + const size_t rows, const size_t cols, T* dy) const; +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/cross_entropy.cc b/paddle/fluid/operators/math/cross_entropy.cc new file mode 100644 index 0000000000000000000000000000000000000000..76abd03ff8b75e595461f41301c41ffe57d78686 --- /dev/null +++ b/paddle/fluid/operators/math/cross_entropy.cc @@ -0,0 +1,60 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/cross_entropy.h" + +namespace paddle { +namespace operators { +namespace math { + +using Tensor = framework::Tensor; +template +using EigenMatrix = framework::EigenMatrix; + +template +class CrossEntropyFunctor { + public: + void operator()(const platform::CPUDeviceContext& ctx, framework::Tensor* out, + const framework::Tensor* prob, + const framework::Tensor* labels, const bool softLabel) { + const int batch_size = prob->dims()[0]; + if (softLabel) { + auto in = EigenMatrix::From(*prob); + auto lbl = EigenMatrix::From(*labels); + auto loss = EigenMatrix::From(*out); + + loss.device(*ctx.eigen_device()) = + -((lbl * in.log().unaryExpr(math::TolerableValue())) + .sum(Eigen::DSizes(1)) + .reshape(Eigen::DSizes(batch_size, 1))); + } else { + const int class_num = prob->dims()[1]; + const T* prob_data = prob->data(); + T* loss_data = out->data(); + + const int64_t* label_data = labels->data(); + for (int i = 0; i < batch_size; ++i) { + int index = i * class_num + label_data[i]; + loss_data[i] = -math::TolerableValue()(std::log(prob_data[index])); + } + } + } +}; + +template class CrossEntropyFunctor; +template class CrossEntropyFunctor; +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/cross_entropy.cu b/paddle/fluid/operators/math/cross_entropy.cu new file mode 100644 index 0000000000000000000000000000000000000000..39222c484c2fe847aec70b65d3d01745b8eea336 --- /dev/null +++ b/paddle/fluid/operators/math/cross_entropy.cu @@ -0,0 +1,131 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/cross_entropy.h" + +namespace paddle { +namespace operators { +namespace math { + +namespace { +template +__global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label, + const int N, const int D) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; + i += blockDim.x * gridDim.x) { + PADDLE_ASSERT(label[i] >= 0 && label[i] < D); + Y[i] = -math::TolerableValue()(log(X[i * D + label[i]])); + } +} + +template +__device__ __forceinline__ T sum_single_warp(T val) { + val += __shfl_down(val, 16); + val += __shfl_down(val, 8); + val += __shfl_down(val, 4); + val += __shfl_down(val, 2); + val += __shfl_down(val, 1); + return val; +} + +// CUDA do not support dynamic arrary in template +// https://stackoverflow.com/questions/20497209 +template +struct SharedMemory { + // Ensure that we won't compile any un-specialized types + __device__ T* GetPointer() { return NULL; } +}; + +template <> +struct SharedMemory { + __device__ float* GetPointer() { + extern __shared__ float s_float[]; + return s_float; + } +}; + +template <> +struct SharedMemory { + __device__ double* GetPointer() { + extern __shared__ double s_double[]; + return s_double; + } +}; + +template +__global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label, + const int class_num) { + int tid = threadIdx.x; + SharedMemory d_sum_shared; + T* d_sum = d_sum_shared.GetPointer(); + d_sum[tid] = 0; + + int cur_idx = tid; + int next_idx = blockIdx.x * class_num + tid; + while (cur_idx < class_num) { + d_sum[tid] += + math::TolerableValue()(std::log(X[next_idx])) * label[next_idx]; + next_idx += blockDim.x; + cur_idx += blockDim.x; + } + __syncthreads(); + + for (unsigned int stride = blockDim.x >> 1; stride >= 32; stride >>= 1) { + if (tid < stride) d_sum[tid] += d_sum[tid + stride]; + __syncthreads(); + } + + T val = d_sum[tid]; + val = sum_single_warp(val); + if (tid == 0) Y[blockIdx.x] = -val; +} +} // namespace + +using Tensor = framework::Tensor; + +template +class CrossEntropyFunctor { + public: + void operator()(const platform::CUDADeviceContext& ctx, + framework::Tensor* out, const framework::Tensor* prob, + const framework::Tensor* labels, bool softLabel) { + const T* prob_data = prob->data(); + T* loss_data = out->mutable_data(ctx.GetPlace()); + + int batch_size = prob->dims()[0]; + int class_num = prob->dims()[1]; + + if (softLabel) { + const T* label_data = labels->data(); + int block = class_num > 512 ? 512 : pow(2, int(std::log2(class_num))); + + SoftCrossEntropyKernel<<< + batch_size, block, block * sizeof(T), + reinterpret_cast(ctx).stream()>>>( + loss_data, prob_data, label_data, class_num); + } else { + const int64_t* label_data = labels->data(); + int block = 512; + int grid = (batch_size + block - 1) / block; + CrossEntropyKernel<<>>( + loss_data, prob_data, label_data, batch_size, class_num); + } + } +}; + +template class CrossEntropyFunctor; +template class CrossEntropyFunctor; +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/cross_entropy.h b/paddle/fluid/operators/math/cross_entropy.h new file mode 100644 index 0000000000000000000000000000000000000000..2fe216a805383ae0d7e8d008af2838652fcf87c6 --- /dev/null +++ b/paddle/fluid/operators/math/cross_entropy.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { +namespace math { + +template +struct TolerableValue { + HOSTDEVICE T operator()(const T& x) const { + PADDLE_ASSERT(std::is_floating_point::value); + const T kApproInf = 1e20; + + if (x == INFINITY) return kApproInf; + if (x == -INFINITY) return -kApproInf; + return x; + } +}; + +template +class CrossEntropyFunctor { + public: + void operator()(const DeviceContext& context, framework::Tensor* out, + const framework::Tensor* prob, + const framework::Tensor* labels, const bool softLabel); +}; +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/depthwise_conv.cu b/paddle/fluid/operators/math/depthwise_conv.cu new file mode 100644 index 0000000000000000000000000000000000000000..7b75e593071eaeb72bcfc687b6ff22b7cf4f143f --- /dev/null +++ b/paddle/fluid/operators/math/depthwise_conv.cu @@ -0,0 +1,311 @@ +/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/depthwise_conv.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { +namespace math { + +// A Cuda kernel to compute the depthwise convolution forward pass +// in NCHW format. 
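// Each thread handles one output element; the linear index decomposes as
//   index = ((batch * output_channels + c_out) * output_height + h_out)
//           * output_width + w_out,
// and the depthwise mapping reads a single input channel
// c_in = c_out / filter_multiplier. The launcher is expected to choose
// nthreads = batch_size * output_channels * output_height * output_width
// (and, assuming the usual convolution arithmetic, output_height =
// (input_height + 2 * padding_height - filter_height) / stride_height + 1,
// similarly for the width).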
+template +__global__ void KernelDepthwiseConv( + const int nthreads, const T* const input_data, const T* const filter_data, + const int batch_size, const int output_channels, const int output_height, + const int output_width, const int input_channels, const int input_height, + const int input_width, const int filter_multiplier, const int filter_height, + const int filter_width, const int stride_height, const int stride_width, + const int padding_height, const int padding_width, T* const output_data) { + int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + if (index < nthreads) { + const int batch = index / output_channels / output_height / output_width; + const int c_out = (index / output_height / output_width) % output_channels; + const int h_out = (index / output_width) % output_height; + const int w_out = index % output_width; + + const int c_in = c_out / filter_multiplier; + const T* weight = filter_data + c_out * filter_height * filter_width; + T value = 0; + const int h_in_start = -padding_height + h_out * stride_height; + const int w_in_start = -padding_width + w_out * stride_width; + const int h_in_end = h_in_start + filter_height; + const int w_in_end = w_in_start + filter_width; + + const int in_offset = + ((batch * input_channels + c_in) * input_height) * input_width; + + const int h_end = h_in_end < input_height ? h_in_end : input_height; + const int w_end = w_in_end < input_width ? w_in_end : input_width; + const int h_start = h_in_start > 0 ? h_in_start : 0; + const int w_start = w_in_start > 0 ? w_in_start : 0; + + for (int h_in = h_start; h_in < h_end; h_in++) { + for (int w_in = w_start; w_in < w_end; w_in++) { + const int offset = in_offset + h_in * input_width + w_in; + value += + weight[(h_in - h_in_start) * filter_width + (w_in - w_in_start)] * + input_data[offset]; + } + } + output_data[index] = value; + } +} + +// CUDA kernel to compute the depthwise convolution backprop w.r.t input. +template +__global__ void KernelDepthwiseConvInputGrad( + const int nthreads, const T* const output_grad_data, + const T* const filter_data, const int batch_size, const int output_channels, + const int output_height, const int output_width, const int input_channels, + const int input_height, const int input_width, const int filter_multiplier, + const int filter_height, const int filter_width, const int stride_height, + const int stride_width, const int padding_height, const int padding_width, + T* const input_grad_data) { + int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + if (index < nthreads) { + const int batch = index / input_channels / input_height / input_width; + const int c_in = (index / input_height / input_width) % input_channels; + const int h_in = (index / input_width) % input_height; + const int w_in = index % input_width; + + const int c_out_start = c_in * filter_multiplier; + + int h_out_start = + (h_in - filter_height + padding_height + stride_height) / stride_height; + h_out_start = 0 > h_out_start ? 0 : h_out_start; + + int h_out_end = (h_in + padding_height) / stride_height; + h_out_end = output_height - 1 < h_out_end ? output_height - 1 : h_out_end; + + int w_out_start = + (w_in - filter_width + padding_width + stride_width) / stride_width; + w_out_start = 0 > w_out_start ? 0 : w_out_start; + + int w_out_end = (w_in + padding_width) / stride_width; + w_out_end = output_width - 1 < w_out_end ? 
output_width - 1 : w_out_end; + + T value = 0; + + for (int c_out = c_out_start; c_out < c_out_start + filter_multiplier; + c_out++) { + for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) { + const int filter_h = h_in + padding_height - h_out * stride_height; + for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) { + const int filter_w = w_in + padding_width - w_out * stride_width; + const int filter_offset = c_out * filter_height * filter_width + + filter_h * filter_width + filter_w; + const int output_grad_offset = + ((batch * output_channels + c_out) * output_height + h_out) * + output_width + + w_out; + value += + output_grad_data[output_grad_offset] * filter_data[filter_offset]; + } + } + } + input_grad_data[index] += value; + } +} + +// Cuda kernel to compute the depthwise convolution backprop w.r.t. filter. +template +__global__ void KernelDepthwiseConvFilterGrad( + const int nthreads, const T* const output_grad_data, + const T* const input_data, const int num, const int output_channels, + const int output_height, const int output_width, const int input_channels, + const int input_height, const int input_width, const int filter_multiplier, + const int filter_height, const int filter_width, const int stride_height, + const int stride_width, const int padding_height, const int padding_width, + T* const filter_grad_data) { + int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + if (index < nthreads) { + const int w_out = index % output_width; + const int h_out = (index / output_width) % output_height; + const int c_out = (index / output_width / output_height) % output_channels; + const int batch = (index / output_width / output_height / output_channels); + const int c_in = c_out / filter_multiplier; + const int h_in_start = -padding_height + h_out * stride_height; + const int w_in_start = -padding_width + w_out * stride_width; + const int h_in_end = + -padding_height + h_out * stride_height + filter_height; + const int w_in_end = -padding_width + w_out * stride_width + filter_width; + const int in_offset = + (batch * input_channels + c_in) * input_height * input_width; + + T* addr_offset = filter_grad_data + c_out * filter_height * filter_width; + const int h_end = h_in_end < input_height ? h_in_end : input_height; + const int w_end = w_in_end < input_width ? w_in_end : input_width; + const int h_start = h_in_start > 0 ? h_in_start : 0; + const int w_start = w_in_start > 0 ? w_in_start : 0; + + for (int h_in = h_start; h_in < h_end; h_in++) { + for (int w_in = w_start; w_in < w_end; w_in++) { + const int offset = in_offset + h_in * input_width + w_in; + const T diff_temp = output_grad_data[index] * input_data[offset]; + T* addr = addr_offset + (h_in - h_in_start) * filter_width + + (w_in - w_in_start); + paddle::platform::CudaAtomicAdd(addr, diff_temp); + } + } + } +} + +/* + * All tensors are in NCHW format. + * Ksize, strides, paddings are two elements. These two elements represent + * height and width, respectively. 
+ */ +template +class DepthwiseConvFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& filter, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* output) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[1]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output->dims()[1]; + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + const int ksize_height = filter.dims()[2]; + const int ksize_width = filter.dims()[3]; + const int stride_height = strides[0]; + const int stride_width = strides[1]; + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + + const T* input_data = input.data(); + const T* filter_data = filter.data(); + T* output_data = output->mutable_data(context.GetPlace()); + + int nthreads = batch_size * output_channels * output_height * output_width; + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelDepthwiseConv<<>>( + nthreads, input_data, filter_data, batch_size, output_channels, + output_height, output_width, input_channels, input_height, input_width, + output_channels / input_channels, ksize_height, ksize_width, + stride_height, stride_width, padding_height, padding_width, + output_data); + } +}; + +template +class DepthwiseConvInputGradFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& filter, + const framework::Tensor& output_grad, + const std::vector& strides, + const std::vector& paddings, + framework::Tensor* input_grad) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[1]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output_grad.dims()[1]; + const int output_height = output_grad.dims()[2]; + const int output_width = output_grad.dims()[3]; + const int ksize_height = filter.dims()[2]; + const int ksize_width = filter.dims()[3]; + const int stride_height = strides[0]; + const int stride_width = strides[1]; + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + + const T* filter_data = filter.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + int nthreads = batch_size * input_channels * input_height * input_width; + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelDepthwiseConvInputGrad<<>>( + nthreads, output_grad_data, filter_data, batch_size, output_channels, + output_height, output_width, input_channels, input_height, input_width, + output_channels / input_channels, ksize_height, ksize_width, + stride_height, stride_width, padding_height, padding_width, + input_grad_data); + } +}; + +template +class DepthwiseConvFilterGradFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& output_grad, + const std::vector& strides, + const std::vector& paddings, + framework::Tensor* filter_grad) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[1]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = 
output_grad.dims()[1]; + const int output_height = output_grad.dims()[2]; + const int output_width = output_grad.dims()[3]; + const int ksize_height = filter_grad->dims()[2]; + const int ksize_width = filter_grad->dims()[3]; + const int stride_height = strides[0]; + const int stride_width = strides[1]; + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + + const T* input_data = input.data(); + const T* output_grad_data = output_grad.data(); + T* filter_grad_data = filter_grad->mutable_data(context.GetPlace()); + + int nthreads = batch_size * output_channels * output_height * output_width; + + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelDepthwiseConvFilterGrad<<>>( + nthreads, output_grad_data, input_data, batch_size, output_channels, + output_height, output_width, input_channels, input_height, input_width, + output_channels / input_channels, ksize_height, ksize_width, + stride_height, stride_width, padding_height, padding_width, + filter_grad_data); + } +}; + +template class DepthwiseConvFunctor; +template class DepthwiseConvFunctor; + +template class DepthwiseConvInputGradFunctor; +template class DepthwiseConvInputGradFunctor; + +template class DepthwiseConvFilterGradFunctor; +template class DepthwiseConvFilterGradFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/depthwise_conv.h b/paddle/fluid/operators/math/depthwise_conv.h new file mode 100644 index 0000000000000000000000000000000000000000..c3081e7a0deb4afc47d826753ecb2556aa6f4522 --- /dev/null +++ b/paddle/fluid/operators/math/depthwise_conv.h @@ -0,0 +1,60 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
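All three depthwise kernels above recover (batch, channel, h, w) from a single flat thread index by repeated division and modulo over the NCHW extents. A small CPU sketch of that decomposition follows; the dimensions are made up for illustration.

#include <cstdio>

int main() {
  const int C = 3, H = 4, W = 5;  // channels, height, width
  const int index = 2 * C * H * W + 1 * H * W + 3 * W + 4;  // n=2, c=1, h=3, w=4

  const int w = index % W;
  const int h = (index / W) % H;
  const int c = (index / W / H) % C;
  const int n = index / W / H / C;

  std::printf("n=%d c=%d h=%d w=%d\n", n, c, h, w);  // prints n=2 c=1 h=3 w=4
  return 0;
}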
*/ + +#pragma once +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { +namespace math { + +/* + * \brief Compute the depthwise convolution which include + * forward process and backpropagation process + */ +template +class DepthwiseConvFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& filter, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* output); +}; + +template +class DepthwiseConvInputGradFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& filter, + const framework::Tensor& output_grad, + const std::vector& strides, + const std::vector& paddings, + framework::Tensor* input_grad); +}; + +template +class DepthwiseConvFilterGradFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& output_grad, + const std::vector& strides, + const std::vector& paddings, + framework::Tensor* filter_grad); +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/detail/CMakeLists.txt b/paddle/fluid/operators/math/detail/CMakeLists.txt similarity index 100% rename from paddle/operators/math/detail/CMakeLists.txt rename to paddle/fluid/operators/math/detail/CMakeLists.txt diff --git a/paddle/fluid/operators/math/detail/activation_functions.h b/paddle/fluid/operators/math/detail/activation_functions.h new file mode 100644 index 0000000000000000000000000000000000000000..3af7ba790c489b2fc34b3cb6d56849ce789d2430 --- /dev/null +++ b/paddle/fluid/operators/math/detail/activation_functions.h @@ -0,0 +1,191 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/hostdevice.h" + +#ifdef __AVX__ +#include +#endif + +namespace paddle { +namespace operators { +namespace math { +namespace detail { + +#define SIGMOID_THRESHOLD_MIN -40.0 +#define SIGMOID_THRESHOLD_MAX 13.0 +#define EXP_MAX_INPUT 40.0 + +enum ActivationType { + kSigmoid, + kReLU, + kTanh, + kIdentity, +}; + +inline ActivationType GetActivationType(const std::string &type) { + if (type == "sigmoid") { + return ActivationType::kSigmoid; + } else if (type == "relu") { + return ActivationType::kReLU; + } else if (type == "tanh") { + return ActivationType::kTanh; + } else if (type == "identity" || type == "") { + return ActivationType::kIdentity; + } + PADDLE_THROW("Not support type %s.", type); +} + +namespace forward { + +template +DEVICE T Identity(const T a) { + return a; +} + +template +DEVICE T Relu(const T a) { + return a > static_cast(0.0) ? 
a : static_cast(0.0); +} + +template +DEVICE T Sigmoid(const T a) { + const T min = SIGMOID_THRESHOLD_MIN; + const T max = SIGMOID_THRESHOLD_MAX; + T tmp = (a < min) ? min : ((a > max) ? max : a); + return static_cast(1.0) / (static_cast(1.0) + exp(-tmp)); +} + +template +DEVICE T Tanh(const T a) { + T tmp = -2.0 * a; + tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp; + return (2.0 / (1.0 + exp(tmp))) - 1.0; +} + +} // namespace forward + +namespace backward { + +template +DEVICE T Identity(const T a, const T b) { + return a; +} + +template +DEVICE T Relu(const T a, const T b) { + return a * (b > 0.0 ? 1.0 : 0.0); +} + +template +DEVICE T Sigmoid(const T a, const T b) { + return a * b * (1.0 - b); +} + +template +DEVICE T Tanh(const T a, const T b) { + return a * (1.0 - b * b); +} + +} // namespace backward + +template +struct Active { + typedef T (*Act)(T); + typedef T (*ActGrad)(T, T); +}; + +static DEVICE Active::Act kActFloat[] = { + &forward::Sigmoid, &forward::Relu, &forward::Tanh, + &forward::Identity}; + +static DEVICE Active::ActGrad kActGradFloat[] = { + &backward::Sigmoid, &backward::Relu, &backward::Tanh, + &backward::Identity}; + +static DEVICE Active::Act kActDouble[] = { + &forward::Sigmoid, &forward::Relu, &forward::Tanh, + &forward::Identity}; + +static DEVICE Active::ActGrad kActGradDouble[] = { + &backward::Sigmoid, &backward::Relu, + &backward::Tanh, &backward::Identity}; + +namespace forward { +inline DEVICE float activation(float a, int index) { + return kActFloat[index](a); +} + +inline DEVICE double activation(double a, int index) { + return kActDouble[index](a); +} + +} // namespace forward + +namespace backward { +inline DEVICE float activation(float a, float b, int index) { + return kActGradFloat[index](a, b); +} + +inline DEVICE double activation(double a, double b, int index) { + return kActGradDouble[index](a, b); +} +} // namespace backward + +#ifdef __AVX__ +namespace forward { +namespace avx { +__m256 Relu(const __m256 a); +__m256 Sigmoid(const __m256 a); +__m256 Tanh(const __m256 a); +__m256 Identity(const __m256 a); +} // namespace avx +} // namespace forward + +namespace backward { +namespace avx { +__m256 Relu(const __m256 a, const __m256 b); +__m256 Sigmoid(const __m256 a, const __m256 b); +__m256 Tanh(const __m256 a, const __m256 b); +__m256 Identity(const __m256 a, const __m256 b); +} // namespace avx +} // namespace backward + +static Active<__m256>::Act kActAvx[] = { + &forward::avx::Sigmoid, &forward::avx::Relu, &forward::avx::Tanh, + &forward::avx::Identity}; + +static Active<__m256>::ActGrad kActGradAvx[] = { + &backward::avx::Sigmoid, &backward::avx::Relu, &backward::avx::Tanh, + &backward::avx::Identity}; + +namespace forward { +inline __m256 activation(__m256 a, int index) { return kActAvx[index](a); } +} // namespace forward + +namespace backward { +inline __m256 activation(__m256 a, __m256 b, int index) { + return kActGradAvx[index](a, b); +} +} // namespace backward + +#endif + +} // namespace detail +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/detail/avx_functions.cc b/paddle/fluid/operators/math/detail/avx_functions.cc new file mode 100644 index 0000000000000000000000000000000000000000..838cd30e3d503ddb4734f1114e741fd40b1939c0 --- /dev/null +++ b/paddle/fluid/operators/math/detail/avx_functions.cc @@ -0,0 +1,90 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
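The header above dispatches activations through function-pointer tables indexed by ActivationType, so the GRU and LSTM kernels can stay agnostic of the concrete activation. A standalone sketch of that table-dispatch pattern is given below; it deliberately omits the SIGMOID_THRESHOLD_* and EXP_MAX_INPUT clipping that the real functions apply, and all names are illustrative.

#include <cmath>
#include <cstdio>

enum Act { kSigmoid = 0, kReLU = 1, kTanh = 2, kIdentity = 3 };

float Sigmoid(float a) { return 1.0f / (1.0f + std::exp(-a)); }
float Relu(float a) { return a > 0.0f ? a : 0.0f; }
float Tanh(float a) { return std::tanh(a); }
float Identity(float a) { return a; }

// One table entry per enum value, in the same order as the enum.
using ActFn = float (*)(float);
static const ActFn kActTable[] = {&Sigmoid, &Relu, &Tanh, &Identity};

int main() {
  std::printf("%f\n", kActTable[kReLU](-2.0f));    // 0.0
  std::printf("%f\n", kActTable[kSigmoid](0.0f));  // 0.5
  return 0;
}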
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef __AVX__ + +#include +#include "paddle/fluid/operators/math/detail/activation_functions.h" +// TODO(qingqing) refine this dependence +#include "paddle/cuda/src/avx_mathfun.h" + +namespace paddle { +namespace operators { +namespace math { +namespace detail { + +__m256 Exp(__m256 a) { return exp256_ps(a); } + +namespace forward { +namespace avx { +__m256 Relu(const __m256 a) { + __m256 tmp = _mm256_set1_ps(0.0f); + return _mm256_max_ps(a, tmp); +} + +__m256 Sigmoid(const __m256 a) { + __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); + __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); + __m256 tmp = _mm256_max_ps(a, min); + tmp = _mm256_min_ps(tmp, max); + tmp = _mm256_sub_ps(_mm256_set1_ps(0.0f), tmp); + tmp = Exp(tmp); + tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); + tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp); + return tmp; +} + +__m256 Tanh(const __m256 a) { + __m256 max = _mm256_set1_ps(EXP_MAX_INPUT); + __m256 tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), a); + tmp = _mm256_min_ps(tmp, max); + tmp = Exp(tmp); + return _mm256_sub_ps(_mm256_div_ps(_mm256_set1_ps(2.0f), + _mm256_add_ps(_mm256_set1_ps(1.0f), tmp)), + _mm256_set1_ps(1.0f)); +} + +__m256 Identity(const __m256 a) { return a; } + +} // namespace avx +} // namespace forward + +namespace backward { +namespace avx { +__m256 Relu(const __m256 a, const __m256 b) { + return _mm256_mul_ps( + a, _mm256_and_ps(_mm256_cmp_ps(b, _mm256_set1_ps(0.0f), _CMP_GT_OS), + _mm256_set1_ps(1.0f))); +} + +__m256 Sigmoid(const __m256 a, const __m256 b) { + return _mm256_mul_ps(_mm256_mul_ps(a, b), + _mm256_sub_ps(_mm256_set1_ps(1.0f), b)); +} + +__m256 Tanh(const __m256 a, const __m256 b) { + return _mm256_mul_ps( + a, _mm256_sub_ps(_mm256_set1_ps(1.0f), _mm256_mul_ps(b, b))); +} + +__m256 Identity(const __m256 a, const __m256 b) { return a; } +} // namespace avx +} // namespace backward + +} // namespace detail +} // namespace math +} // namespace operators +} // namespace paddle + +#endif diff --git a/paddle/fluid/operators/math/detail/gru_cpu_kernel.h b/paddle/fluid/operators/math/detail/gru_cpu_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..75c5c8eb29a34047a22779edcc0fc2f5fbcbab6f --- /dev/null +++ b/paddle/fluid/operators/math/detail/gru_cpu_kernel.h @@ -0,0 +1,426 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
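Both the scalar Tanh above and its AVX counterpart rely on the identity tanh(x) = 2 / (1 + exp(-2x)) - 1, with the exp argument capped at EXP_MAX_INPUT to avoid overflow. A scalar sketch of that computation follows; the constant value is taken from the header, the function name is illustrative.

#include <cmath>
#include <cstdio>

float TanhCapped(float a) {
  const float kExpMaxInput = 40.0f;  // EXP_MAX_INPUT in the header
  float t = -2.0f * a;
  t = (t > kExpMaxInput) ? kExpMaxInput : t;  // cap the exp argument
  return 2.0f / (1.0f + std::exp(t)) - 1.0f;
}

int main() {
  std::printf("%f vs %f\n", TanhCapped(0.5f), std::tanh(0.5f));  // ~0.462 both
  std::printf("%f\n", TanhCapped(-100.0f));  // saturates near -1 without overflow
  return 0;
}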
*/ + +#pragma once +#include +#include "paddle/fluid/operators/math/detail/activation_functions.h" +#include "paddle/fluid/operators/math/gru_compute.h" + +namespace paddle { +namespace operators { +namespace math { +namespace detail { + +#ifndef __NVCC__ + +template +void hl_naive_gru_forward_reset_output(OpResetOutput op_reset_output, + T *gate_value, T *reset_output_value, + T *prev_output_value, int frame_size, + ActivationType active_gate) { + T r_value_update_gate; + T r_value_reset_gate; + T r_value_reset_output; + T r_prev_out = 0; + T *update_gate = gate_value; + T *reset_gate = gate_value + frame_size; + + for (int i = 0; i < frame_size; i++) { + r_value_update_gate = update_gate[i]; + r_value_reset_gate = reset_gate[i]; + if (prev_output_value) { + r_prev_out = prev_output_value[i]; + } + + op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out, + r_value_reset_output, active_gate); + + update_gate[i] = r_value_update_gate; + reset_gate[i] = r_value_reset_gate; + reset_output_value[i] = r_value_reset_output; + } +} + +template +void hl_naive_gru_forward_final_output(OpFinalOutput op_final_output, + T *gate_value, T *prev_output_value, + T *output_value, int frame_size, + ActivationType active_node) { + T r_value_update_gate; + T r_value_frame_state; + T r_prev_out = 0; + T r_output; + T *update_gate = gate_value; + T *frame_state = gate_value + frame_size * 2; + + for (int i = 0; i < frame_size; i++) { + r_value_update_gate = update_gate[i]; + r_value_frame_state = frame_state[i]; + if (prev_output_value) { + r_prev_out = prev_output_value[i]; + } + + op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out, + r_output, active_node); + + frame_state[i] = r_value_frame_state; + output_value[i] = r_output; + } +} + +template +void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output, + T *gate_value, T *reset_output_value, + T *prev_output_value, int frame_size, + ActivationType active_gate) { +#ifdef __AVX__ + __m256 r_value_update_gate; + __m256 r_value_reset_gate; + __m256 r_value_reset_output; + __m256 r_prev_out = _mm256_set1_ps(0.0f); + __m256 *update_gate = (__m256 *)gate_value; + __m256 *reset_gate = (__m256 *)(gate_value + frame_size); + + for (int i = 0; i < frame_size / 8; i++) { + r_value_update_gate = update_gate[i]; + r_value_reset_gate = reset_gate[i]; + if (prev_output_value) { + r_prev_out = ((__m256 *)prev_output_value)[i]; + } + + op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out, + r_value_reset_output, active_gate); + + update_gate[i] = r_value_update_gate; + reset_gate[i] = r_value_reset_gate; + ((__m256 *)reset_output_value)[i] = r_value_reset_output; + } +#endif +} + +template +void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output, + T *gate_value, T *prev_output_value, + T *output_value, int frame_size, + ActivationType active_node) { +#ifdef __AVX__ + __m256 r_value_update_gate; + __m256 r_value_frame_state; + __m256 r_prev_out = _mm256_set1_ps(0.0f); + __m256 r_output; + __m256 *update_gate = (__m256 *)gate_value; + __m256 *frame_state = (__m256 *)(gate_value + frame_size * 2); + + for (int i = 0; i < frame_size / 8; i++) { + r_value_update_gate = update_gate[i]; + r_value_frame_state = frame_state[i]; + if (prev_output_value) { + r_prev_out = ((__m256 *)prev_output_value)[i]; + } + + op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out, + r_output, active_node); + + frame_state[i] = r_value_frame_state; + ((__m256 *)output_value)[i] = r_output; + } +#endif +} + +template 
+inline void forward_reset_output(OpResetOutput op_reset_output, + GRUMetaValue value, int frame_size, + int batch_size, ActivationType active_gate) { + for (int b = 0; b < batch_size; b++) { + if (OpResetOutput::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) { + hl_avx_gru_forward_reset_output( + op_reset_output, value.gate_value, value.reset_output_value, + value.prev_out_value, frame_size, active_gate); + } else { + hl_naive_gru_forward_reset_output( + op_reset_output, value.gate_value, value.reset_output_value, + value.prev_out_value, frame_size, active_gate); + } + + value.gate_value += frame_size * 3; + value.reset_output_value += frame_size; + if (value.prev_out_value) { + value.prev_out_value += frame_size; + } + } +} + +template +inline void forward_final_output(OpFinalOutput op_final_output, + GRUMetaValue value, int frame_size, + int batch_size, ActivationType active_node) { + for (int b = 0; b < batch_size; b++) { + if (OpFinalOutput::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) { + hl_avx_gru_forward_final_output(op_final_output, value.gate_value, + value.prev_out_value, value.output_value, + frame_size, active_node); + } else { + hl_naive_gru_forward_final_output( + op_final_output, value.gate_value, value.prev_out_value, + value.output_value, frame_size, active_node); + } + + value.gate_value += frame_size * 3; + value.output_value += frame_size; + if (value.prev_out_value) { + value.prev_out_value += frame_size; + } + } +} + +template +void hl_naive_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value, + T *gate_grad, T *prev_out_value, + T *prev_out_grad, T *output_grad, + int frame_size, + ActivationType active_node) { + T r_update_gate_value; + T r_update_gate_grad; + T r_frame_state_value; + T r_frame_state_grad; + T r_out_grad; + T r_prev_out_value = 0; + T r_prev_out_grad = 0; + T *update_gate_value = gate_value; + T *update_gate_grad = gate_grad; + T *frame_state_value = gate_value + frame_size * 2; + T *frame_state_grad = gate_grad + frame_size * 2; + + for (int i = 0; i < frame_size; i++) { + r_update_gate_value = update_gate_value[i]; + r_frame_state_value = frame_state_value[i]; + r_out_grad = output_grad[i]; + if (prev_out_value) { + r_prev_out_value = prev_out_value[i]; + } + if (prev_out_grad) { + r_prev_out_grad = prev_out_grad[i]; + } + + op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value, + r_frame_state_grad, r_prev_out_value, r_prev_out_grad, + r_out_grad, active_node); + + update_gate_grad[i] = r_update_gate_grad; + frame_state_grad[i] = r_frame_state_grad; + if (prev_out_grad) { + prev_out_grad[i] = r_prev_out_grad; + } + } +} + +template +void hl_naive_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value, + T *gate_grad, T *prev_out_value, + T *prev_out_grad, T *reset_output_grad, + int frame_size, + ActivationType active_gate) { + T r_update_gate_value; + T r_update_gate_grad; + T r_reset_gate_value; + T r_reset_gate_grad; + T r_reset_output_grad = 0; + T r_prev_out_value = 0; + T r_prev_out_grad = 0; + T *update_gate_value = gate_value; + T *update_gate_grad = gate_grad; + T *reset_gate_value = gate_value + frame_size; + T *reset_gate_grad = gate_grad + frame_size; + + for (int i = 0; i < frame_size; i++) { + r_update_gate_value = update_gate_value[i]; + r_update_gate_grad = update_gate_grad[i]; + r_reset_gate_value = reset_gate_value[i]; + + if (prev_out_value && prev_out_grad) { + r_reset_output_grad = reset_output_grad[i]; + } + if (prev_out_value) { + r_prev_out_value = 
prev_out_value[i]; + } + if (prev_out_grad) { + r_prev_out_grad = prev_out_grad[i]; + } + + op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value, + r_reset_gate_grad, r_prev_out_value, r_prev_out_grad, + r_reset_output_grad, active_gate); + + update_gate_grad[i] = r_update_gate_grad; + reset_gate_grad[i] = r_reset_gate_grad; + if (prev_out_grad) { + prev_out_grad[i] = r_prev_out_grad; + } + } +} + +template +void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value, + T *gate_grad, T *prev_out_value, + T *prev_out_grad, T *output_grad, + int frame_size, + ActivationType active_node) { +#ifdef __AVX__ + __m256 r_update_gate_value; + __m256 r_update_gate_grad; + __m256 r_frame_state_value; + __m256 r_frame_state_grad; + __m256 r_out_grad; + __m256 r_prev_out_value = _mm256_set1_ps(0.0f); + __m256 r_prev_out_grad = _mm256_set1_ps(0.0f); + __m256 *update_gate_value = (__m256 *)gate_value; + __m256 *update_gate_grad = (__m256 *)gate_grad; + __m256 *frame_state_value = (__m256 *)(gate_value + frame_size * 2); + __m256 *frame_state_grad = (__m256 *)(gate_grad + frame_size * 2); + + for (int i = 0; i < frame_size / 8; i++) { + r_update_gate_value = update_gate_value[i]; + r_frame_state_value = frame_state_value[i]; + r_out_grad = ((__m256 *)output_grad)[i]; + if (prev_out_value) { + r_prev_out_value = ((__m256 *)prev_out_value)[i]; + } + if (prev_out_grad) { + r_prev_out_grad = ((__m256 *)prev_out_grad)[i]; + } + + op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value, + r_frame_state_grad, r_prev_out_value, r_prev_out_grad, + r_out_grad, active_node); + + update_gate_grad[i] = r_update_gate_grad; + frame_state_grad[i] = r_frame_state_grad; + if (prev_out_grad) { + ((__m256 *)prev_out_grad)[i] = r_prev_out_grad; + } + } +#endif +} + +template +void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value, + T *gate_grad, T *prev_out_value, + T *prev_out_grad, T *reset_output_grad, + int frame_size, + ActivationType active_gate) { +#ifdef __AVX__ + __m256 r_update_gate_value; + __m256 r_update_gate_grad; + __m256 r_reset_gate_value; + __m256 r_reset_gate_grad; + __m256 r_reset_output_grad = _mm256_set1_ps(0.0f); + __m256 r_prev_out_value = _mm256_set1_ps(0.0f); + __m256 r_prev_out_grad = _mm256_set1_ps(0.0f); + __m256 *update_gate_value = (__m256 *)gate_value; + __m256 *update_gate_grad = (__m256 *)gate_grad; + __m256 *reset_gate_value = (__m256 *)(gate_value + frame_size); + __m256 *reset_gate_grad = (__m256 *)(gate_grad + frame_size); + + for (int i = 0; i < frame_size / 8; i++) { + r_update_gate_value = update_gate_value[i]; + r_update_gate_grad = update_gate_grad[i]; + r_reset_gate_value = reset_gate_value[i]; + + if (prev_out_value && prev_out_grad) { + r_reset_output_grad = ((__m256 *)reset_output_grad)[i]; + } + if (prev_out_value) { + r_prev_out_value = ((__m256 *)prev_out_value)[i]; + } + if (prev_out_grad) { + r_prev_out_grad = ((__m256 *)prev_out_grad)[i]; + } + + op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value, + r_reset_gate_grad, r_prev_out_value, r_prev_out_grad, + r_reset_output_grad, active_gate); + + update_gate_grad[i] = r_update_gate_grad; + reset_gate_grad[i] = r_reset_gate_grad; + if (prev_out_grad) { + ((__m256 *)prev_out_grad)[i] = r_prev_out_grad; + } + } +#endif +} + +template +inline void backward_state_grad(OpStateGrad op_state_grad, + GRUMetaValue value, GRUMetaGrad grad, + int frame_size, int batch_size, + ActivationType active_node) { + for (int b = 0; b < batch_size; 
b++) { + if (OpStateGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) { + hl_avx_gru_backward_state_grad( + op_state_grad, value.gate_value, grad.gate_grad, value.prev_out_value, + grad.prev_out_grad, grad.output_grad, frame_size, active_node); + } else { + hl_naive_gru_backward_state_grad( + op_state_grad, value.gate_value, grad.gate_grad, value.prev_out_value, + grad.prev_out_grad, grad.output_grad, frame_size, active_node); + } + + value.gate_value += frame_size * 3; + if (value.prev_out_value) { + value.prev_out_value += frame_size; + } + + grad.gate_grad += frame_size * 3; + grad.output_grad += frame_size; + if (grad.prev_out_grad) { + grad.prev_out_grad += frame_size; + } + } +} + +template +inline void backward_reset_grad(OpResetGrad op_reset_grad, + GRUMetaValue value, GRUMetaGrad grad, + int frame_size, int batch_size, + ActivationType active_gate) { + for (int b = 0; b < batch_size; b++) { + if (OpResetGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) { + hl_avx_gru_backward_reset_grad( + op_reset_grad, value.gate_value, grad.gate_grad, value.prev_out_value, + grad.prev_out_grad, grad.reset_output_grad, frame_size, active_gate); + } else { + hl_naive_gru_backward_reset_grad( + op_reset_grad, value.gate_value, grad.gate_grad, value.prev_out_value, + grad.prev_out_grad, grad.reset_output_grad, frame_size, active_gate); + } + + value.gate_value += frame_size * 3; + if (value.prev_out_value) { + value.prev_out_value += frame_size; + } + + grad.gate_grad += frame_size * 3; + grad.reset_output_grad += frame_size; + if (grad.prev_out_grad) { + grad.prev_out_grad += frame_size; + } + } +} + +#endif + +} // namespace detail +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/detail/gru_gpu_kernel.h b/paddle/fluid/operators/math/detail/gru_gpu_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..fbf69d4a85883a68f137945ed8978acb9108b77b --- /dev/null +++ b/paddle/fluid/operators/math/detail/gru_gpu_kernel.h @@ -0,0 +1,201 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
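The batch loops above only take the AVX path when frame_size is a multiple of 8, since one __m256 register holds 8 floats, and the test is written as a bit mask rather than a modulo. A tiny sketch of that check:

#include <cstdio>

// Same test the loops use: frame_size & (8 - 1) is zero exactly when
// frame_size is divisible by 8.
bool MultipleOf8(int frame_size) { return !(frame_size & (8 - 1)); }

int main() {
  std::printf("%d %d %d\n", MultipleOf8(16), MultipleOf8(24), MultipleOf8(20));
  // prints 1 1 0: 16 and 24 can use the AVX path, 20 falls back to the naive loop
  return 0;
}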
*/ + +#pragma once +#include +#include "paddle/fluid/operators/math/detail/activation_functions.h" +#include "paddle/fluid/operators/math/gru_compute.h" +#include "paddle/fluid/platform/cuda_helper.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace math { +namespace detail { + +/* + * threads(frame_per_block, batch_per_block) + * grid(frame_blocks, batch_blocks) + */ +template +__global__ void KeGruForwardResetOutput(OpResetOutput op_reset_output, + T *gate_value, T *reset_output_value, + T *prev_output_value, int frame_size, + int batch_size, + ActivationType active_gate) { + const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (frame_idx >= frame_size) return; + + int batch_idx = 0; + if (is_batch) { + batch_idx = blockIdx.y * blockDim.y + threadIdx.y; + if (batch_idx >= batch_size) return; + gate_value += batch_idx * 3 * frame_size; + reset_output_value += batch_idx * frame_size; + } + + T r_prev_out = 0; + T r_value_reset_output; + T r_value_update_gate = gate_value[frame_idx + frame_size * 0]; + T r_value_reset_gate = gate_value[frame_idx + frame_size * 1]; + + if (prev_output_value) { + if (is_batch) prev_output_value += batch_idx * frame_size; + r_prev_out = prev_output_value[frame_idx]; + } + + op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out, + r_value_reset_output, active_gate); + + gate_value[frame_idx + frame_size * 0] = r_value_update_gate; + gate_value[frame_idx + frame_size * 1] = r_value_reset_gate; + reset_output_value[frame_idx] = r_value_reset_output; +} + +/* + * threads(frame_per_block, batch_per_block) + * grid(frame_blocks, batch_blocks) + */ +template +__global__ void KeGruForwardFinalOutput(OpFinalOutput op_final_output, + T *gate_value, T *prev_output_value, + T *output_value, int frame_size, + int batch_size, + ActivationType active_node) { + const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (frame_idx >= frame_size) return; + int batch_idx = 0; + if (is_batch) { + batch_idx = blockIdx.y * blockDim.y + threadIdx.y; + if (batch_idx >= batch_size) return; + gate_value += batch_idx * 3 * frame_size; + output_value += batch_idx * frame_size; + } + + T r_output; + T r_prev_out = 0; + T r_value_update_gate = gate_value[frame_idx + frame_size * 0]; + T r_value_frame_state = gate_value[frame_idx + frame_size * 2]; + + if (prev_output_value) { + if (is_batch) prev_output_value += batch_idx * frame_size; + r_prev_out = prev_output_value[frame_idx]; + } + + op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out, + r_output, active_node); + + gate_value[frame_idx + frame_size * 2] = r_value_frame_state; + output_value[frame_idx] = r_output; +} + +/* + * threads(frame_per_block, batch_per_block) + * grid(frame_blocks, batch_blocks) + */ +template +__global__ void KeGruBackwardStateGrad(OpStateGrad op_state_grad, T *gate_value, + T *gate_grad, T *prev_out_value, + T *prev_out_grad, T *output_grad, + int frame_size, int batch_size, + ActivationType active_node) { + const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (frame_idx >= frame_size) return; + int batch_idx = 0; + if (is_batch) { + batch_idx = blockIdx.y * blockDim.y + threadIdx.y; + if (batch_idx >= batch_size) return; + gate_value += batch_idx * 3 * frame_size; + gate_grad += batch_idx * 3 * frame_size; + output_grad += batch_idx * frame_size; + } + + T r_update_gate_grad; + T r_frame_state_grad; + T r_prev_out_value = 0; + T r_prev_out_grad = 0; + T r_update_gate_value = 
gate_value[frame_idx + frame_size * 0]; + T r_frame_state_value = gate_value[frame_idx + frame_size * 2]; + T r_out_grad = output_grad[frame_idx]; + + if (prev_out_value && prev_out_grad) { + if (is_batch) prev_out_value += batch_idx * frame_size; + r_prev_out_value = prev_out_value[frame_idx]; + + if (is_batch) prev_out_grad += batch_idx * frame_size; + r_prev_out_grad = prev_out_grad[frame_idx]; + } + + op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value, + r_frame_state_grad, r_prev_out_value, r_prev_out_grad, + r_out_grad, active_node); + + gate_grad[frame_idx + frame_size * 0] = r_update_gate_grad; + gate_grad[frame_idx + frame_size * 2] = r_frame_state_grad; + if (prev_out_grad) { + prev_out_grad[frame_idx] = r_prev_out_grad; + } +} + +/* + * threads(frame_per_block, batch_per_block) + * grid(frame_blocks, batch_blocks) + */ +template +__global__ void KeGruBackwardResetGrad(OpResetGrad op_reset_grad, T *gate_value, + T *gate_grad, T *prev_out_value, + T *prev_out_grad, T *reset_output_grad, + int frame_size, int batch_size, + ActivationType active_gate) { + const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (frame_idx >= frame_size) return; + int batch_idx = 0; + if (is_batch) { + batch_idx = blockIdx.y * blockDim.y + threadIdx.y; + if (batch_idx >= batch_size) return; + gate_value += batch_idx * 3 * frame_size; + gate_grad += batch_idx * 3 * frame_size; + reset_output_grad += batch_idx * frame_size; + } + + T r_reset_gate_grad; + T r_prev_out_value = 0; + T r_prev_out_grad = 0; + T r_reset_output_grad = 0; + T r_update_gate_value = gate_value[frame_idx + frame_size * 0]; + T r_update_gate_grad = gate_grad[frame_idx + frame_size * 0]; + T r_reset_gate_value = gate_value[frame_idx + frame_size * 1]; + + if (prev_out_value && prev_out_grad) { + if (is_batch) prev_out_value += batch_idx * frame_size; + if (is_batch) prev_out_grad += batch_idx * frame_size; + r_prev_out_value = prev_out_value[frame_idx]; + r_prev_out_grad = prev_out_grad[frame_idx]; + r_reset_output_grad = reset_output_grad[frame_idx]; + } + + op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value, + r_reset_gate_grad, r_prev_out_value, r_prev_out_grad, + r_reset_output_grad, active_gate); + + gate_grad[frame_idx + frame_size * 0] = r_update_gate_grad; + gate_grad[frame_idx + frame_size * 1] = r_reset_gate_grad; + if (prev_out_grad) { + prev_out_grad[frame_idx] = r_prev_out_grad; + } +} +} // namespace detail +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/detail/gru_kernel.h b/paddle/fluid/operators/math/detail/gru_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..705787e2ff76630fbcb23e646c91f74fa2feea24 --- /dev/null +++ b/paddle/fluid/operators/math/detail/gru_kernel.h @@ -0,0 +1,163 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
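The GRU kernels above address gates through a fixed per-row layout: each batch row stores [update | reset | frame_state], each frame_size elements wide, and a thread combines its batch_idx and frame_idx into offsets within that row. A small sketch of the offset arithmetic with made-up sizes:

#include <cstdio>

int main() {
  const int frame_size = 4;
  const int batch_idx = 2, frame_idx = 1;

  const int row = batch_idx * 3 * frame_size;  // start of this row's three gates
  const int update_off = row + frame_idx + frame_size * 0;
  const int reset_off = row + frame_idx + frame_size * 1;
  const int state_off = row + frame_idx + frame_size * 2;

  std::printf("update=%d reset=%d frame_state=%d\n", update_off, reset_off,
              state_off);  // prints update=25 reset=29 frame_state=33
  return 0;
}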
*/ + +#include "paddle/fluid/operators/math/detail/activation_functions.h" +#include "paddle/fluid/platform/hostdevice.h" + +#include + +// TODO(guosheng): refine code style in gru_kernel +namespace paddle { +namespace operators { +namespace math { +namespace detail { + +namespace forward { + +template +class gru_resetOutput { + public: + HOSTDEVICE void operator()(T &value_update_gate, T &value_reset_gate, + T &prev_out, T &value_reset_output, + ActivationType act_gate) { + value_update_gate = activation(value_update_gate, act_gate); + value_reset_gate = activation(value_reset_gate, act_gate); + value_reset_output = prev_out * value_reset_gate; + } +#ifndef __NVCC__ +#ifndef __AVX__ + static const bool avx = false; +#else + static const bool avx = true; + HOSTDEVICE void operator()(__m256 &value_update_gate, + __m256 &value_reset_gate, __m256 &prev_out, + __m256 &value_reset_output, + ActivationType act_gate) { + value_update_gate = activation(value_update_gate, act_gate); + value_reset_gate = activation(value_reset_gate, act_gate); + value_reset_output = _mm256_mul_ps(prev_out, value_reset_gate); + } +#endif +#endif +}; + +template +class gru_finalOutput { + public: + HOSTDEVICE void operator()(T &value_update_gate, T &value_frame_state, + T &prev_out, T &value_output, + ActivationType act_input) { + value_frame_state = activation(value_frame_state, act_input); + value_output = prev_out - (value_update_gate * prev_out) + + (value_update_gate * value_frame_state); + } +#ifndef __NVCC__ +#ifndef __AVX__ + static const bool avx = false; +#else + static const bool avx = true; + HOSTDEVICE void operator()(__m256 &value_update_gate, + __m256 &value_frame_state, __m256 &prev_out, + __m256 &value_output, ActivationType act_input) { + value_frame_state = activation(value_frame_state, act_input); + value_output = _mm256_add_ps( + _mm256_sub_ps(prev_out, _mm256_mul_ps(value_update_gate, prev_out)), + _mm256_mul_ps(value_update_gate, value_frame_state)); + } +#endif +#endif +}; +} // namespace forward + +namespace backward { + +template +class gru_stateGrad { + public: + HOSTDEVICE void operator()(T &value_update_gate, T &grad_update_gate, + T &value_frame_state, T &grad_frame_state, + T &value_prev_out, T &grad_prev_out, + T &grad_output, ActivationType act_input) { + grad_update_gate = (grad_output * value_frame_state); + grad_update_gate -= (grad_output * value_prev_out); + grad_prev_out -= (grad_output * value_update_gate); + grad_prev_out += grad_output; + grad_frame_state = activation(grad_output * value_update_gate, + value_frame_state, act_input); + } +#ifndef __NVCC__ +#ifndef __AVX__ + static const bool avx = false; +#else + static const bool avx = true; + HOSTDEVICE void operator()(__m256 &value_update_gate, + __m256 &grad_update_gate, + __m256 &value_frame_state, + __m256 &grad_frame_state, __m256 &value_prev_out, + __m256 &grad_prev_out, __m256 &grad_output, + ActivationType act_input) { + grad_update_gate = _mm256_mul_ps(grad_output, value_frame_state); + grad_update_gate = _mm256_sub_ps( + grad_update_gate, _mm256_mul_ps(grad_output, value_prev_out)); + grad_prev_out = _mm256_add_ps( + _mm256_sub_ps(grad_prev_out, + _mm256_mul_ps(grad_output, value_update_gate)), + grad_output); + grad_frame_state = activation(_mm256_mul_ps(grad_output, value_update_gate), + value_frame_state, act_input); + } +#endif +#endif +}; + +template +class gru_resetGrad { + public: + HOSTDEVICE void operator()(T &value_update_gate, T &grad_update_gate, + T &value_reset_gate, T &grad_reset_gate, + T 
&value_prev_out, T &grad_prev_out, + T &grad_reset_output, ActivationType act_gate) { + grad_reset_gate = (grad_reset_output * value_prev_out); + grad_prev_out += (grad_reset_output * value_reset_gate); + grad_update_gate = + activation(grad_update_gate, value_update_gate, act_gate); + grad_reset_gate = activation(grad_reset_gate, value_reset_gate, act_gate); + } +#ifndef __NVCC__ +#ifndef __AVX__ + static const bool avx = false; +#else + static const bool avx = true; + HOSTDEVICE void operator()(__m256 &value_update_gate, + __m256 &grad_update_gate, __m256 &value_reset_gate, + __m256 &grad_reset_gate, __m256 &value_prev_out, + __m256 &grad_prev_out, __m256 &grad_reset_output, + ActivationType act_gate) { + grad_reset_gate = _mm256_mul_ps(grad_reset_output, value_prev_out); + grad_prev_out = _mm256_add_ps( + grad_prev_out, _mm256_mul_ps(grad_reset_output, value_reset_gate)); + grad_update_gate = + activation(grad_update_gate, value_update_gate, act_gate); + grad_reset_gate = activation(grad_reset_gate, value_reset_gate, act_gate); + } +#endif +#endif +}; + +} // namespace backward + +} // namespace detail +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..bf26509ba17774f55c4f7592ff3afb7bcfcaa336 --- /dev/null +++ b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h @@ -0,0 +1,312 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/operators/math/detail/activation_functions.h" +#include "paddle/fluid/operators/math/lstm_compute.h" + +namespace paddle { +namespace operators { +namespace math { +namespace detail { + +#ifndef __NVCC__ + +template +void naive_lstm_forward_one_sequence(Op op, LstmMetaValue value, + int frame_size, ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { + T r_value_in; + T r_value_ig; + T r_value_fg; + T r_value_og; + T r_checkI; + T r_checkF; + T r_checkO; + T r_state; + T r_prev_state = 0; + T r_state_atv; + T r_out; + + T *value_in = value.gate_value; + T *value_ig = value.gate_value + frame_size; + T *value_fg = value.gate_value + frame_size * 2; + T *value_og = value.gate_value + frame_size * 3; + + for (int i = 0; i < frame_size; i++) { + r_value_in = value_in[i]; + r_value_ig = value_ig[i]; + r_value_fg = value_fg[i]; + r_value_og = value_og[i]; + r_checkI = value.check_ig ? value.check_ig[i] : 0; + r_checkF = value.check_fg ? value.check_fg[i] : 0; + r_checkO = value.check_og ? 
value.check_og[i] : 0; + + if (value.prev_state_value) { + r_prev_state = value.prev_state_value[i]; + } + + op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state, + r_state_atv, r_out, r_checkI, r_checkF, r_checkO, active_node, + active_gate, active_state); + + value_in[i] = r_value_in; + value_ig[i] = r_value_ig; + value_fg[i] = r_value_fg; + value_og[i] = r_value_og; + value.state_value[i] = r_state; + value.state_active_value[i] = r_state_atv; + value.output_value[i] = r_out; + } +} + +template +void naive_lstm_backward_one_sequence(Op op, LstmMetaValue value, + LstmMetaGrad grad, int frame_size, + ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { + T r_value_in; + T r_value_ig; + T r_value_fg; + T r_value_og; + T r_grad_in; + T r_grad_ig; + T r_grad_fg; + T r_grad_og; + T r_prev_state = 0; + T r_prev_state_grad; + T r_state; + T r_state_grad; + T r_state_atv; + T r_output_grad; + T r_checkI; + T r_checkF; + T r_checkO; + T r_checkIGrad; + T r_checkFGrad; + T r_checkOGrad; + + T *value_in = value.gate_value; + T *value_ig = value.gate_value + frame_size; + T *value_fg = value.gate_value + frame_size * 2; + T *value_og = value.gate_value + frame_size * 3; + T *grad_in = grad.gate_grad; + T *grad_ig = grad.gate_grad + frame_size; + T *grad_fg = grad.gate_grad + frame_size * 2; + T *grad_og = grad.gate_grad + frame_size * 3; + + for (int i = 0; i < frame_size; i++) { + r_value_in = value_in[i]; + r_value_ig = value_ig[i]; + r_value_fg = value_fg[i]; + r_value_og = value_og[i]; + r_checkI = value.check_ig ? value.check_ig[i] : 0; + r_checkF = value.check_fg ? value.check_fg[i] : 0; + r_checkO = value.check_og ? value.check_og[i] : 0; + r_state = value.state_value[i]; + r_state_atv = value.state_active_value[i]; + r_output_grad = grad.output_grad[i]; + r_state_grad = grad.state_grad[i]; + if (value.prev_state_value) { + r_prev_state = value.prev_state_value[i]; + } + + op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig, + r_grad_fg, r_grad_og, r_prev_state, r_prev_state_grad, r_state, + r_state_grad, r_state_atv, r_output_grad, r_checkI, r_checkF, r_checkO, + r_checkIGrad, r_checkFGrad, r_checkOGrad, active_node, active_gate, + active_state); + + grad_in[i] = r_grad_in; + grad_ig[i] = r_grad_ig; + grad_fg[i] = r_grad_fg; + grad_og[i] = r_grad_og; + grad.state_grad[i] = r_state_grad; + + if (grad.prev_state_grad) grad.prev_state_grad[i] = r_prev_state_grad; + if (value.prev_state_value) { + if (grad.check_ig_grad) grad.check_ig_grad[i] += r_checkIGrad; + if (grad.check_fg_grad) grad.check_fg_grad[i] += r_checkFGrad; + } + if (grad.check_og_grad) grad.check_og_grad[i] += r_checkOGrad; + } +} + +template +void avx_lstm_forward_one_sequence(Op op, LstmMetaValue value, + int frame_size, ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { +#ifdef __AVX__ + __m256 r_value_in; + __m256 r_value_ig; + __m256 r_value_fg; + __m256 r_value_og; + __m256 r_checkI = _mm256_set1_ps(0.0f); + __m256 r_checkF = _mm256_set1_ps(0.0f); + __m256 r_checkO = _mm256_set1_ps(0.0f); + __m256 r_state; + __m256 r_prev_state = _mm256_set1_ps(0.0f); + __m256 r_state_atv; + __m256 r_out; + + __m256 *value_in = (__m256 *)value.gate_value; + __m256 *value_ig = (__m256 *)(value.gate_value + frame_size); + __m256 *value_fg = (__m256 *)(value.gate_value + frame_size * 2); + __m256 *value_og = (__m256 *)(value.gate_value + frame_size * 3); + + for (int i = 0; i < frame_size / 8; i++) { + r_value_in = 
value_in[i]; + r_value_ig = value_ig[i]; + r_value_fg = value_fg[i]; + r_value_og = value_og[i]; + if (value.check_ig) { + r_checkI = ((__m256 *)value.check_ig)[i]; + r_checkF = ((__m256 *)value.check_fg)[i]; + r_checkO = ((__m256 *)value.check_og)[i]; + } + + if (value.prev_state_value) { + r_prev_state = ((__m256 *)value.prev_state_value)[i]; + } + + op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state, + r_state_atv, r_out, r_checkI, r_checkF, r_checkO, active_node, + active_gate, active_state); + + value_in[i] = r_value_in; + value_ig[i] = r_value_ig; + value_fg[i] = r_value_fg; + value_og[i] = r_value_og; + ((__m256 *)value.state_value)[i] = r_state; + ((__m256 *)value.state_active_value)[i] = r_state_atv; + ((__m256 *)value.output_value)[i] = r_out; + } +#endif +} + +template +void avx_lstm_backward_one_sequence(Op op, LstmMetaValue value, + LstmMetaGrad grad, int frame_size, + ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { +#ifdef __AVX__ + __m256 r_value_in; + __m256 r_value_ig; + __m256 r_value_fg; + __m256 r_value_og; + __m256 r_grad_in; + __m256 r_grad_ig; + __m256 r_grad_fg; + __m256 r_grad_og; + __m256 r_prev_state = _mm256_set1_ps(0.0f); + __m256 r_prev_state_grad; + __m256 r_state_grad; + __m256 r_state; + __m256 r_state_atv; + __m256 r_output_grad; + __m256 r_checkI = _mm256_set1_ps(0.0f); + __m256 r_checkF = _mm256_set1_ps(0.0f); + __m256 r_checkO = _mm256_set1_ps(0.0f); + __m256 r_checkIGrad; + __m256 r_checkFGrad; + __m256 r_checkOGrad; + + __m256 *value_in = (__m256 *)value.gate_value; + __m256 *value_ig = (__m256 *)(value.gate_value + frame_size); + __m256 *value_fg = (__m256 *)(value.gate_value + frame_size * 2); + __m256 *value_og = (__m256 *)(value.gate_value + frame_size * 3); + __m256 *grad_in = (__m256 *)grad.gate_grad; + __m256 *grad_ig = (__m256 *)(grad.gate_grad + frame_size); + __m256 *grad_fg = (__m256 *)(grad.gate_grad + frame_size * 2); + __m256 *grad_og = (__m256 *)(grad.gate_grad + frame_size * 3); + + for (int i = 0; i < frame_size / 8; i++) { + r_value_in = value_in[i]; + r_value_ig = value_ig[i]; + r_value_fg = value_fg[i]; + r_value_og = value_og[i]; + if (value.check_ig) { + r_checkI = ((__m256 *)value.check_ig)[i]; + r_checkF = ((__m256 *)value.check_fg)[i]; + r_checkO = ((__m256 *)value.check_og)[i]; + } + r_state = ((__m256 *)value.state_value)[i]; + r_state_atv = ((__m256 *)value.state_active_value)[i]; + r_output_grad = ((__m256 *)grad.output_grad)[i]; + r_state_grad = ((__m256 *)grad.state_grad)[i]; + if (value.prev_state_value) { + r_prev_state = ((__m256 *)value.prev_state_value)[i]; + } + + op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig, + r_grad_fg, r_grad_og, r_prev_state, r_prev_state_grad, r_state, + r_state_grad, r_state_atv, r_output_grad, r_checkI, r_checkF, r_checkO, + r_checkIGrad, r_checkFGrad, r_checkOGrad, active_node, active_gate, + active_state); + + grad_in[i] = r_grad_in; + grad_ig[i] = r_grad_ig; + grad_fg[i] = r_grad_fg; + grad_og[i] = r_grad_og; + ((__m256 *)grad.state_grad)[i] = r_state_grad; + + if (grad.prev_state_grad) + ((__m256 *)grad.prev_state_grad)[i] = r_prev_state_grad; + if (value.prev_state_value) { + if (grad.check_ig_grad) ((__m256 *)grad.check_ig_grad)[i] += r_checkIGrad; + if (grad.check_fg_grad) ((__m256 *)grad.check_fg_grad)[i] += r_checkFGrad; + } + if (grad.check_og_grad) ((__m256 *)grad.check_og_grad)[i] += r_checkOGrad; + } +#endif +} + +template +void cpu_lstm_forward(Op op, LstmMetaValue value, int frame_size, 
+ ActivationType active_node, ActivationType active_gate, + ActivationType active_state) { + if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same::value)) { + avx_lstm_forward_one_sequence(op, value, frame_size, active_node, + active_gate, active_state); + } else { + naive_lstm_forward_one_sequence(op, value, frame_size, active_node, + active_gate, active_state); + } +} + +template +void cpu_lstm_backward(Op op, LstmMetaValue value, LstmMetaGrad grad, + int frame_size, ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { + if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same::value)) { + avx_lstm_backward_one_sequence(op, value, grad, frame_size, active_node, + active_gate, active_state); + } else { + naive_lstm_backward_one_sequence(op, value, grad, frame_size, + active_node, active_gate, active_state); + } +} + +#endif + +} // namespace detail +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h b/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..7865d0c0ba12c6150d87ea22e9c597e90b57e1ba --- /dev/null +++ b/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h @@ -0,0 +1,255 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/operators/math/detail/activation_functions.h" +#include "paddle/fluid/operators/math/lstm_compute.h" +#include "paddle/fluid/platform/cuda_helper.h" +#include "paddle/fluid/platform/device_context.h" + +#include + +namespace paddle { +namespace operators { +namespace math { +namespace detail { + +/* + * threads(frame_per_block, batch_per_block) + * grid(frame_blocks, batch_blocks) + */ +template +__global__ void KeLstmForward(Op op, LstmMetaValue value, int frame_size, + int batch_size, ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { + const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (frame_idx >= frame_size) return; + + int batch_idx = 0; + if (is_batch) { + batch_idx = blockIdx.y * blockDim.y + threadIdx.y; + if (batch_idx >= batch_size) return; + value.gate_value += batch_idx * frame_size * 4; + value.output_value += batch_idx * frame_size; + value.state_value += batch_idx * frame_size; + value.state_active_value += batch_idx * frame_size; + } + + T r_state; + T r_prev_state = 0; + T r_state_atv; + T r_out; + T r_value_in; + T r_value_ig; + T r_value_fg; + T r_value_og; + + T r_checkI = value.check_ig ? value.check_ig[frame_idx] : 0; + T r_checkF = value.check_fg ? value.check_fg[frame_idx] : 0; + T r_checkO = value.check_og ? 
value.check_og[frame_idx] : 0; + + r_value_in = value.gate_value[frame_idx]; + r_value_ig = value.gate_value[frame_idx + frame_size]; + r_value_fg = value.gate_value[frame_idx + frame_size * 2]; + r_value_og = value.gate_value[frame_idx + frame_size * 3]; + + if (value.prev_state_value) { + if (is_batch) value.prev_state_value += batch_idx * frame_size; + r_prev_state = value.prev_state_value[frame_idx]; + } + + op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state, + r_state_atv, r_out, r_checkI, r_checkF, r_checkO, active_node, active_gate, + active_state); + + value.gate_value[frame_idx] = r_value_in; + value.gate_value[frame_idx + frame_size] = r_value_ig; + value.gate_value[frame_idx + frame_size * 2] = r_value_fg; + value.gate_value[frame_idx + frame_size * 3] = r_value_og; + + value.state_value[frame_idx] = r_state; + value.state_active_value[frame_idx] = r_state_atv; + value.output_value[frame_idx] = r_out; +} + +/* + * threads(frame_per_block, batch_per_block) + * grid(frame_blocks, batch_blocks) + */ +template +__global__ void KeLstmBackward(Op op, LstmMetaValue value, + LstmMetaGrad grad, int frame_size, + int batch_size, ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { + const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (frame_idx >= frame_size) return; + + int batch_idx = 0; + if (is_batch) { + batch_idx = blockIdx.y * blockDim.y + threadIdx.y; + if (batch_idx >= batch_size) return; + value.gate_value += batch_idx * frame_size * 4; + value.state_value += batch_idx * frame_size; + value.state_active_value += batch_idx * frame_size; + grad.gate_grad += batch_idx * frame_size * 4; + grad.state_grad += batch_idx * frame_size; + grad.output_grad += batch_idx * frame_size; + } + + T r_value_in; + T r_value_ig; + T r_value_fg; + T r_value_og; + T r_grad_in; + T r_grad_ig; + T r_grad_fg; + T r_grad_og; + T r_prev_state = 0; + T r_prev_state_grad; + T r_state; + T r_state_grad; + T r_state_atv; + T r_output_grad; + T r_checkI = value.check_ig ? value.check_ig[frame_idx] : 0; + T r_checkF = value.check_fg ? value.check_fg[frame_idx] : 0; + T r_checkO = value.check_og ? 
value.check_og[frame_idx] : 0; + + T r_checkIGrad; + T r_checkFGrad; + T r_checkOGrad; + + r_value_in = value.gate_value[frame_idx]; + r_value_ig = value.gate_value[frame_idx + frame_size]; + r_value_fg = value.gate_value[frame_idx + frame_size * 2]; + r_value_og = value.gate_value[frame_idx + frame_size * 3]; + r_state = value.state_value[frame_idx]; + r_state_atv = value.state_active_value[frame_idx]; + r_output_grad = grad.output_grad[frame_idx]; + r_state_grad = grad.state_grad[frame_idx]; + + if (value.prev_state_value) { + if (is_batch) value.prev_state_value += batch_idx * frame_size; + r_prev_state = value.prev_state_value[frame_idx]; + } + + op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig, + r_grad_fg, r_grad_og, r_prev_state, r_prev_state_grad, r_state, + r_state_grad, r_state_atv, r_output_grad, r_checkI, r_checkF, r_checkO, + r_checkIGrad, r_checkFGrad, r_checkOGrad, active_node, active_gate, + active_state); + + grad.gate_grad[frame_idx] = r_grad_in; + grad.gate_grad[frame_idx + frame_size] = r_grad_ig; + grad.gate_grad[frame_idx + frame_size * 2] = r_grad_fg; + grad.gate_grad[frame_idx + frame_size * 3] = r_grad_og; + grad.state_grad[frame_idx] = r_state_grad; + if (grad.prev_state_grad) { + if (is_batch) grad.prev_state_grad += batch_idx * frame_size; + grad.prev_state_grad[frame_idx] = r_prev_state_grad; + } + + if (is_batch) { + if (value.prev_state_value) { + if (grad.check_ig_grad) + paddle::platform::CudaAtomicAdd(grad.check_ig_grad + frame_idx, + r_checkIGrad); + if (grad.check_fg_grad) + paddle::platform::CudaAtomicAdd(grad.check_fg_grad + frame_idx, + r_checkFGrad); + } + if (grad.check_og_grad) + paddle::platform::CudaAtomicAdd(grad.check_og_grad + frame_idx, + r_checkOGrad); + } else { + if (value.prev_state_value) { + if (grad.check_ig_grad) grad.check_ig_grad[frame_idx] += r_checkIGrad; + if (grad.check_fg_grad) grad.check_fg_grad[frame_idx] += r_checkFGrad; + } + if (grad.check_og_grad) grad.check_og_grad[frame_idx] += r_checkOGrad; + } +} + +template +void gpu_lstm_forward(const platform::DeviceContext& context, Op op, + LstmMetaValue value, int frame_size, int batch_size, + ActivationType active_node, ActivationType active_gate, + ActivationType active_state) { + dim3 threads; + dim3 grid; + if (batch_size == 1) { + int frame_per_block = frame_size <= 1024 ? frame_size : 1024; + int frame_blocks = (frame_size + 1024 - 1) / 1024; + threads = dim3(frame_per_block, 1); + grid = dim3(frame_blocks, 1); + } else { + /* frame_per_block = 32 batch_per_block = 32 */ + threads = dim3(32, 32); + grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32); + } + + auto stream = + reinterpret_cast(context).stream(); + if (batch_size == 1) { + KeLstmForward<<>>( + op, value, frame_size, batch_size, active_node, active_gate, + active_state); + } else { + KeLstmForward<<>>( + op, value, frame_size, batch_size, active_node, active_gate, + active_state); + } +} + +template +void gpu_lstm_backward(const platform::DeviceContext& context, Op op, + LstmMetaValue value, LstmMetaGrad grad, + int frame_size, int batch_size, + ActivationType active_node, ActivationType active_gate, + ActivationType active_state) { + dim3 threads; + dim3 grid; + if (batch_size == 1) { + int frame_per_block = frame_size <= 1024 ? 
frame_size : 1024; + int frame_blocks = (frame_size + 1024 - 1) / 1024; + threads = dim3(frame_per_block, 1); + grid = dim3(frame_blocks, 1); + } else { + /* frame_per_block = 32 batch_per_block = 16 */ + threads = dim3(32, 16); + grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 16 - 1) / 16); + } + + auto stream = + reinterpret_cast(context).stream(); + if (batch_size == 1) { + KeLstmBackward<<>>( + op, value, grad, frame_size, batch_size, active_node, active_gate, + active_state); + } else { + KeLstmBackward<<>>( + op, value, grad, frame_size, batch_size, active_node, active_gate, + active_state); + } +} + +} // namespace detail +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/detail/lstm_kernel.h b/paddle/fluid/operators/math/detail/lstm_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..0679cc62ba91fce540d9f6e227729a96a1553173 --- /dev/null +++ b/paddle/fluid/operators/math/detail/lstm_kernel.h @@ -0,0 +1,148 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/detail/activation_functions.h" +#include "paddle/fluid/platform/hostdevice.h" + +#include + +namespace paddle { +namespace operators { +namespace math { +namespace detail { + +namespace forward { + +template +class lstm { + public: + HOSTDEVICE void operator()(T &value_in, T &value_ig, T &value_fg, T &value_og, + T &prev_state, T &state, T &state_atv, T &output, + T &checkI, T &checkF, T &checkO, + ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { + value_in = activation(value_in, active_node); + value_ig = activation(value_ig + prev_state * checkI, active_gate); + value_fg = activation(value_fg + prev_state * checkF, active_gate); + state = value_in * value_ig + prev_state * value_fg; + value_og = activation(value_og + state * checkO, active_gate); + state_atv = activation(state, active_state); + output = value_og * state_atv; + } +#ifndef __NVCC__ +#ifndef __AVX__ // If not compiled with AVX instructs. 
Disable AVX by default + static const bool avx = false; +#else + // Only float support AVX optimization + static const bool avx = std::is_same::value; + + HOSTDEVICE void operator()(__m256 &value_in, __m256 &value_ig, + __m256 &value_fg, __m256 &value_og, + __m256 &prev_state, __m256 &state, + __m256 &state_atv, __m256 &output, __m256 &checkI, + __m256 &checkF, __m256 &checkO, + ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { + value_in = activation(value_in, active_node); + value_ig = + activation(_mm256_add_ps(value_ig, _mm256_mul_ps(prev_state, checkI)), + active_gate); + value_fg = + activation(_mm256_add_ps(value_fg, _mm256_mul_ps(prev_state, checkF)), + active_gate); + state = _mm256_add_ps(_mm256_mul_ps(value_in, value_ig), + _mm256_mul_ps(prev_state, value_fg)); + value_og = activation(_mm256_add_ps(value_og, _mm256_mul_ps(state, checkO)), + active_gate); + state_atv = activation(state, active_state); + output = _mm256_mul_ps(value_og, state_atv); + } +#endif +#endif +}; + +} // namespace forward + +namespace backward { + +template +class lstm { + public: + HOSTDEVICE void operator()(T &value_in, T &value_ig, T &value_fg, T &value_og, + T &grad_in, T &grad_ig, T &grad_fg, T &grad_og, + T &prev_state, T &prev_state_grad, T &state, + T &state_grad, T &state_atv, T &output_grad, + T &checkI, T &checkF, T &checkO, T &checkIGrad, + T &checkFGrad, T &checkOGrad, + ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { + grad_og = activation(output_grad * state_atv, value_og, active_gate); + state_grad += activation(output_grad * value_og, state_atv, active_state) + + grad_og * checkO; + grad_in = activation(state_grad * value_ig, value_in, active_node); + grad_ig = activation(state_grad * value_in, value_ig, active_gate); + grad_fg = activation(state_grad * prev_state, value_fg, active_gate); + prev_state_grad = + grad_ig * checkI + grad_fg * checkF + state_grad * value_fg; + checkIGrad = grad_ig * prev_state; + checkFGrad = grad_fg * prev_state; + checkOGrad = grad_og * state; + } +#ifndef __NVCC__ +#ifndef __AVX__ // If not compiled with AVX instructs. 
Disable AVX by default + static const bool avx = false; +#else + // Only float support AVX optimization + static const bool avx = std::is_same::value; + HOSTDEVICE void operator()( + __m256 &value_in, __m256 &value_ig, __m256 &value_fg, __m256 &value_og, + __m256 &grad_in, __m256 &grad_ig, __m256 &grad_fg, __m256 &grad_og, + __m256 &prev_state, __m256 &prev_state_grad, __m256 &state, + __m256 &state_grad, __m256 &state_atv, __m256 &output_grad, + __m256 &checkI, __m256 &checkF, __m256 &checkO, __m256 &checkIGrad, + __m256 &checkFGrad, __m256 &checkOGrad, ActivationType active_node, + ActivationType active_gate, ActivationType active_state) { + grad_og = activation(_mm256_mul_ps(output_grad, state_atv), value_og, + active_gate); + state_grad = _mm256_add_ps(activation(_mm256_mul_ps(output_grad, value_og), + state_atv, active_state), + state_grad); + state_grad = _mm256_add_ps(_mm256_mul_ps(grad_og, checkO), state_grad); + grad_in = + activation(_mm256_mul_ps(state_grad, value_ig), value_in, active_node); + grad_ig = + activation(_mm256_mul_ps(state_grad, value_in), value_ig, active_gate); + grad_fg = activation(_mm256_mul_ps(state_grad, prev_state), value_fg, + active_gate); + prev_state_grad = _mm256_add_ps(_mm256_mul_ps(grad_ig, checkI), + _mm256_mul_ps(grad_fg, checkF)); + prev_state_grad = + _mm256_add_ps(_mm256_mul_ps(state_grad, value_fg), prev_state_grad); + checkIGrad = _mm256_mul_ps(grad_ig, prev_state); + checkFGrad = _mm256_mul_ps(grad_fg, prev_state); + checkOGrad = _mm256_mul_ps(grad_og, state); + } +#endif +#endif +}; + +} // namespace backward + +} // namespace detail +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/detection_util.h b/paddle/fluid/operators/math/detection_util.h new file mode 100644 index 0000000000000000000000000000000000000000..13e5d406c11a10dc87533a2ca07d14f4684446f5 --- /dev/null +++ b/paddle/fluid/operators/math/detection_util.h @@ -0,0 +1,300 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once +#include +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace math { +template +struct BBox { + BBox(T x_min, T y_min, T x_max, T y_max) + : x_min(x_min), + y_min(y_min), + x_max(x_max), + y_max(y_max), + is_difficult(false) {} + + BBox() {} + + T get_width() const { return x_max - x_min; } + + T get_height() const { return y_max - y_min; } + + T get_center_x() const { return (x_min + x_max) / 2; } + + T get_center_y() const { return (y_min + y_max) / 2; } + + T get_area() const { return get_width() * get_height(); } + + // coordinate of bounding box + T x_min; + T y_min; + T x_max; + T y_max; + // whether difficult object (e.g. 
object with heavy occlusion is difficult) + bool is_difficult; +}; +// KNCHW ==> NHWC +// template +template +void GetBBoxFromPriorData(const T* prior_data, const size_t num_bboxes, + std::vector>& bbox_vec); +template +void GetBBoxVarFromPriorData(const T* prior_data, const size_t num, + std::vector>& var_vec); +template +BBox DecodeBBoxWithVar(BBox& prior_bbox, + const std::vector& prior_bbox_var, + const std::vector& loc_pred_data); +template +bool SortScorePairDescend(const std::pair& pair1, + const std::pair& pair2); +template +bool SortScorePairDescend(const std::pair>& pair1, + const std::pair>& pair2); +template +T jaccard_overlap(const BBox& bbox1, const BBox& bbox2); + +template +void ApplyNmsFast(const std::vector>& bboxes, const T* conf_score_data, + size_t class_idx, size_t top_k, T conf_threshold, + T nms_threshold, size_t num_priors, size_t num_classes, + std::vector* indices); +template +int GetDetectionIndices( + const T* conf_data, const size_t num_priors, const size_t num_classes, + const size_t background_label_id, const size_t batch_size, + const T conf_threshold, const size_t nms_top_k, const T nms_threshold, + const size_t top_k, + const std::vector>>& all_decoded_bboxes, + std::vector>>* all_detection_indices); +template +BBox ClipBBox(const BBox& bbox); +template +void GetDetectionOutput( + const T* conf_data, const size_t num_kept, const size_t num_priors, + const size_t num_classes, const size_t batch_size, + const std::vector>>& all_indices, + const std::vector>>& all_decoded_bboxes, T* out_data); +template +void GetBBoxFromPriorData(const T* prior_data, const size_t num_bboxes, + std::vector>& bbox_vec) { + size_t out_offset = bbox_vec.size(); + bbox_vec.resize(bbox_vec.size() + num_bboxes); + for (size_t i = 0; i < num_bboxes; ++i) { + BBox bbox; + bbox.x_min = *(prior_data + i * 8); + bbox.y_min = *(prior_data + i * 8 + 1); + bbox.x_max = *(prior_data + i * 8 + 2); + bbox.y_max = *(prior_data + i * 8 + 3); + bbox_vec[out_offset + i] = bbox; + } +} +template +void GetBBoxVarFromPriorData(const T* prior_data, const size_t num, + std::vector>& var_vec) { + size_t out_offset = var_vec.size(); + var_vec.resize(var_vec.size() + num); + for (size_t i = 0; i < num; ++i) { + std::vector var; + var.push_back(*(prior_data + i * 8 + 4)); + var.push_back(*(prior_data + i * 8 + 5)); + var.push_back(*(prior_data + i * 8 + 6)); + var.push_back(*(prior_data + i * 8 + 7)); + var_vec[out_offset + i] = var; + } +} +template +BBox DecodeBBoxWithVar(BBox& prior_bbox, + const std::vector& prior_bbox_var, + const std::vector& loc_pred_data) { + T prior_bbox_width = prior_bbox.get_width(); + T prior_bbox_height = prior_bbox.get_height(); + T prior_bbox_center_x = prior_bbox.get_center_x(); + T prior_bbox_center_y = prior_bbox.get_center_y(); + + T decoded_bbox_center_x = + prior_bbox_var[0] * loc_pred_data[0] * prior_bbox_width + + prior_bbox_center_x; + T decoded_bbox_center_y = + prior_bbox_var[1] * loc_pred_data[1] * prior_bbox_height + + prior_bbox_center_y; + T decoded_bbox_width = + std::exp(prior_bbox_var[2] * loc_pred_data[2]) * prior_bbox_width; + T decoded_bbox_height = + std::exp(prior_bbox_var[3] * loc_pred_data[3]) * prior_bbox_height; + + BBox decoded_bbox; + decoded_bbox.x_min = decoded_bbox_center_x - decoded_bbox_width / 2; + decoded_bbox.y_min = decoded_bbox_center_y - decoded_bbox_height / 2; + decoded_bbox.x_max = decoded_bbox_center_x + decoded_bbox_width / 2; + decoded_bbox.y_max = decoded_bbox_center_y + decoded_bbox_height / 2; + + return decoded_bbox; +} 
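`DecodeBBoxWithVar` above applies the usual SSD-style box regression: the predicted offsets are first scaled by the per-prior variances, the center offsets are applied relative to the prior box's center and size, and the width/height offsets pass through an exponential. Below is a minimal standalone sketch of the same arithmetic on one concrete prior box; the `main` harness and the numbers are illustrative only and are not part of this patch.

```cpp
#include <cmath>
#include <cstdio>

// Illustrative only: mirrors the arithmetic of DecodeBBoxWithVar for a
// single prior box, using hypothetical numbers.
int main() {
  // Prior box [x_min, y_min, x_max, y_max] and its four variances.
  float prior[4] = {0.1f, 0.1f, 0.3f, 0.5f};
  float var[4] = {0.1f, 0.1f, 0.2f, 0.2f};
  // Location prediction [dx, dy, dw, dh] from the network.
  float loc[4] = {1.0f, -0.5f, 0.2f, 0.1f};

  float pw = prior[2] - prior[0];            // prior width  = 0.2
  float ph = prior[3] - prior[1];            // prior height = 0.4
  float pcx = (prior[0] + prior[2]) / 2.f;   // prior center x = 0.2
  float pcy = (prior[1] + prior[3]) / 2.f;   // prior center y = 0.3

  float cx = var[0] * loc[0] * pw + pcx;     // decoded center x
  float cy = var[1] * loc[1] * ph + pcy;     // decoded center y
  float w = std::exp(var[2] * loc[2]) * pw;  // decoded width
  float h = std::exp(var[3] * loc[3]) * ph;  // decoded height

  // Convert back to corner form, as DecodeBBoxWithVar does.
  printf("decoded box: [%f, %f, %f, %f]\n", cx - w / 2, cy - h / 2,
         cx + w / 2, cy + h / 2);
  return 0;
}
```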
+template +bool SortScorePairDescend(const std::pair& pair1, + const std::pair& pair2) { + return pair1.first > pair2.first; +} +template +T jaccard_overlap(const BBox& bbox1, const BBox& bbox2) { + if (bbox2.x_min > bbox1.x_max || bbox2.x_max < bbox1.x_min || + bbox2.y_min > bbox1.y_max || bbox2.y_max < bbox1.y_min) { + return 0.0; + } else { + T inter_x_min = std::max(bbox1.x_min, bbox2.x_min); + T inter_y_min = std::max(bbox1.y_min, bbox2.y_min); + T interX_max = std::min(bbox1.x_max, bbox2.x_max); + T interY_max = std::min(bbox1.y_max, bbox2.y_max); + + T inter_width = interX_max - inter_x_min; + T inter_height = interY_max - inter_y_min; + T inter_area = inter_width * inter_height; + + T bbox_area1 = bbox1.get_area(); + T bbox_area2 = bbox2.get_area(); + + return inter_area / (bbox_area1 + bbox_area2 - inter_area); + } +} + +template +void ApplyNmsFast(const std::vector>& bboxes, const T* conf_score_data, + size_t class_idx, size_t top_k, T conf_threshold, + T nms_threshold, size_t num_priors, size_t num_classes, + std::vector* indices) { + std::vector> scores; + for (size_t i = 0; i < num_priors; ++i) { + size_t conf_offset = i * num_classes + class_idx; + if (conf_score_data[conf_offset] > conf_threshold) + scores.push_back(std::make_pair(conf_score_data[conf_offset], i)); + } + std::stable_sort(scores.begin(), scores.end(), + SortScorePairDescend); + if (top_k > 0 && top_k < scores.size()) scores.resize(top_k); + while (scores.size() > 0) { + const size_t idx = scores.front().second; + bool keep = true; + for (size_t i = 0; i < indices->size(); ++i) { + if (keep) { + const size_t saved_idx = (*indices)[i]; + T overlap = jaccard_overlap(bboxes[idx], bboxes[saved_idx]); + keep = overlap <= nms_threshold; + } else { + break; + } + } + if (keep) indices->push_back(idx); + scores.erase(scores.begin()); + } +} +template +int GetDetectionIndices( + const T* conf_data, const size_t num_priors, const size_t num_classes, + const size_t background_label_id, const size_t batch_size, + const T conf_threshold, const size_t nms_top_k, const T nms_threshold, + const size_t top_k, + const std::vector>>& all_decoded_bboxes, + std::vector>>* all_detection_indices) { + int total_keep_num = 0; + for (size_t n = 0; n < batch_size; ++n) { + const std::vector>& decoded_bboxes = all_decoded_bboxes[n]; + size_t num_detected = 0; + std::map> indices; + size_t conf_offset = n * num_priors * num_classes; + for (size_t c = 0; c < num_classes; ++c) { + if (c == background_label_id) continue; + ApplyNmsFast(decoded_bboxes, conf_data + conf_offset, c, nms_top_k, + conf_threshold, nms_threshold, num_priors, num_classes, + &(indices[c])); + num_detected += indices[c].size(); + } + if (top_k > 0 && num_detected > top_k) { + // std::vector> score_index_pairs; + std::vector>> score_index_pairs; + for (size_t c = 0; c < num_classes; ++c) { + const std::vector& label_indices = indices[c]; + for (size_t i = 0; i < label_indices.size(); ++i) { + size_t idx = label_indices[i]; + score_index_pairs.push_back( + std::make_pair((conf_data + conf_offset)[idx * num_classes + c], + std::make_pair(c, idx))); + } + } + std::sort(score_index_pairs.begin(), score_index_pairs.end(), + SortScorePairDescend>); + score_index_pairs.resize(top_k); + std::map> new_indices; + for (size_t i = 0; i < score_index_pairs.size(); ++i) { + size_t label = score_index_pairs[i].second.first; + size_t idx = score_index_pairs[i].second.second; + new_indices[label].push_back(idx); + } + all_detection_indices->push_back(new_indices); + total_keep_num += 
top_k; + } else { + all_detection_indices->push_back(indices); + total_keep_num += num_detected; + } + } + return total_keep_num; +} +template +BBox ClipBBox(const BBox& bbox) { + T one = static_cast(1.0); + T zero = static_cast(0.0); + BBox clipped_bbox; + clipped_bbox.x_min = std::max(std::min(bbox.x_min, one), zero); + clipped_bbox.y_min = std::max(std::min(bbox.y_min, one), zero); + clipped_bbox.x_max = std::max(std::min(bbox.x_max, one), zero); + clipped_bbox.y_max = std::max(std::min(bbox.y_max, one), zero); + return clipped_bbox; +} +template +void GetDetectionOutput( + const T* conf_data, const size_t num_kept, const size_t num_priors, + const size_t num_classes, const size_t batch_size, + const std::vector>>& all_indices, + const std::vector>>& all_decoded_bboxes, T* out_data) { + size_t count = 0; + for (size_t n = 0; n < batch_size; ++n) { + for (std::map>::const_iterator it = + all_indices[n].begin(); + it != all_indices[n].end(); ++it) { + size_t label = it->first; + const std::vector& indices = it->second; + const std::vector>& decoded_bboxes = all_decoded_bboxes[n]; + for (size_t i = 0; i < indices.size(); ++i) { + size_t idx = indices[i]; + size_t conf_offset = n * num_priors * num_classes + idx * num_classes; + out_data[count * 7] = n; + out_data[count * 7 + 1] = label; + out_data[count * 7 + 2] = (conf_data + conf_offset)[label]; + BBox clipped_bbox = ClipBBox(decoded_bboxes[idx]); + out_data[count * 7 + 3] = clipped_bbox.x_min; + out_data[count * 7 + 4] = clipped_bbox.y_min; + out_data[count * 7 + 5] = clipped_bbox.x_max; + out_data[count * 7 + 6] = clipped_bbox.y_max; + ++count; + } + } + } +} +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/gru_compute.cc b/paddle/fluid/operators/math/gru_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..100318041679e38b37fd1ef1f071d4e682889756 --- /dev/null +++ b/paddle/fluid/operators/math/gru_compute.cc @@ -0,0 +1,104 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/gru_compute.h" +#include "paddle/fluid/operators/math/detail/gru_cpu_kernel.h" +#include "paddle/fluid/operators/math/detail/gru_kernel.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { +namespace math { + +template +struct GRUUnitFunctor { + static void compute(const platform::CPUDeviceContext &context, + GRUMetaValue value, int frame_size, int batch_size, + const detail::ActivationType active_node, + const detail::ActivationType active_gate) { +#ifndef __NVCC__ + if (value.prev_out_value) { + math::gemm( + context, false, false, batch_size, frame_size * 2, frame_size, 1, + value.prev_out_value, frame_size, value.gate_weight, frame_size * 2, + 1, value.gate_value, frame_size * 3); + } + + detail::forward_reset_output(detail::forward::gru_resetOutput(), value, + frame_size, batch_size, active_gate); + + if (value.prev_out_value) { + math::gemm( + context, false, false, batch_size, frame_size, frame_size, 1, + value.reset_output_value, frame_size, value.state_weight, frame_size, + 1, value.gate_value + frame_size * 2, frame_size * 3); + } + + detail::forward_final_output(detail::forward::gru_finalOutput(), value, + frame_size, batch_size, active_node); +#endif + } +}; + +template +struct GRUUnitGradFunctor { + static void compute(const platform::CPUDeviceContext &context, + GRUMetaValue value, GRUMetaGrad grad, + int frame_size, int batch_size, + const detail::ActivationType active_node, + const detail::ActivationType active_gate) { +#ifndef __NVCC__ + detail::backward_state_grad(detail::backward::gru_stateGrad(), value, + grad, frame_size, batch_size, active_node); + + if (value.prev_out_value && grad.prev_out_grad) { + math::gemm( + context, false, true, batch_size, frame_size, frame_size, 1, + grad.gate_grad + frame_size * 2, frame_size * 3, value.state_weight, + frame_size, 0, grad.reset_output_grad, frame_size); + + if (grad.state_weight_grad) { + math::gemm( + context, true, false, frame_size, frame_size, batch_size, 1, + value.reset_output_value, frame_size, + grad.gate_grad + frame_size * 2, frame_size * 3, 1, + grad.state_weight_grad, frame_size); + } + } + + detail::backward_reset_grad(detail::backward::gru_resetGrad(), value, + grad, frame_size, batch_size, active_gate); + + if (grad.prev_out_grad && value.prev_out_value) { + math::gemm( + context, false, true, batch_size, frame_size, frame_size * 2, 1, + grad.gate_grad, frame_size * 3, value.gate_weight, frame_size * 2, 1, + grad.prev_out_grad, frame_size); + + if (grad.gate_weight_grad) { + math::gemm( + context, true, false, frame_size, frame_size * 2, batch_size, 1, + value.prev_out_value, frame_size, grad.gate_grad, frame_size * 3, 1, + grad.gate_weight_grad, frame_size * 2); + } + } +#endif + } +}; + +template struct GRUUnitFunctor; +template struct GRUUnitFunctor; +template struct GRUUnitGradFunctor; +template struct GRUUnitGradFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/gru_compute.cu b/paddle/fluid/operators/math/gru_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..0d5d5d7a743150c397a5b8356d1b3add88c509b6 --- /dev/null +++ b/paddle/fluid/operators/math/gru_compute.cu @@ -0,0 +1,178 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/detail/gru_gpu_kernel.h" +#include "paddle/fluid/operators/math/detail/gru_kernel.h" +#include "paddle/fluid/operators/math/gru_compute.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { +namespace math { + +template +struct GRUUnitFunctor { + static void compute(const platform::CUDADeviceContext &context, + GRUMetaValue value, int frame_size, int batch_size, + const detail::ActivationType active_node, + const detail::ActivationType active_gate) { + auto stream = context.stream(); + dim3 threads; + dim3 grid; + if (batch_size == 1) { + int frame_per_block = frame_size <= 1024 ? frame_size : 1024; + int frame_blocks = (frame_size + 1024 - 1) / 1024; + threads = dim3(frame_per_block, 1); + grid = dim3(frame_blocks, 1); + } else { + threads = dim3(32, 32); + grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32); + } + + if (value.prev_out_value) { + math::gemm( + context, false, false, batch_size, frame_size * 2, frame_size, 1, + value.prev_out_value, frame_size, value.gate_weight, frame_size * 2, + 1, value.gate_value, frame_size * 3); + } + + if (batch_size == 1) { + detail::KeGruForwardResetOutput, + /* is_batch= */ false, + T><<>>( + detail::forward::gru_resetOutput(), value.gate_value, + value.reset_output_value, value.prev_out_value, frame_size, + batch_size, active_gate); + } else { + detail::KeGruForwardResetOutput, + /* is_batch= */ true, + T><<>>( + detail::forward::gru_resetOutput(), value.gate_value, + value.reset_output_value, value.prev_out_value, frame_size, + batch_size, active_gate); + } + + if (value.prev_out_value) { + math::gemm( + context, false, false, batch_size, frame_size, frame_size, 1, + value.reset_output_value, frame_size, value.state_weight, frame_size, + 1, value.gate_value + frame_size * 2, frame_size * 3); + } + + if (batch_size == 1) { + detail::KeGruForwardFinalOutput, + /* is_batch= */ false, + T><<>>( + detail::forward::gru_finalOutput(), value.gate_value, + value.prev_out_value, value.output_value, frame_size, batch_size, + active_node); + } else { + detail::KeGruForwardFinalOutput, + /* is_batch= */ true, + T><<>>( + detail::forward::gru_finalOutput(), value.gate_value, + value.prev_out_value, value.output_value, frame_size, batch_size, + active_node); + } + } +}; + +template +struct GRUUnitGradFunctor { + static void compute(const platform::CUDADeviceContext &context, + GRUMetaValue value, GRUMetaGrad grad, + int frame_size, int batch_size, + const detail::ActivationType active_node, + const detail::ActivationType active_gate) { + auto stream = context.stream(); + dim3 threads; + dim3 grid; + if (batch_size == 1) { + int frame_per_block = frame_size <= 1024 ? 
frame_size : 1024; + int frame_blocks = (frame_size + 1024 - 1) / 1024; + threads = dim3(frame_per_block, 1); + grid = dim3(frame_blocks, 1); + } else { + threads = dim3(32, 32); + grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32); + } + + if (batch_size == 1) { + detail::KeGruBackwardStateGrad< + detail::backward::gru_stateGrad, + /* is_batch= */ false><<>>( + detail::backward::gru_stateGrad(), value.gate_value, + grad.gate_grad, value.prev_out_value, grad.prev_out_grad, + grad.output_grad, frame_size, batch_size, active_node); + } else { + detail::KeGruBackwardStateGrad< + detail::backward::gru_stateGrad, + /* is_batch= */ true><<>>( + detail::backward::gru_stateGrad(), value.gate_value, + grad.gate_grad, value.prev_out_value, grad.prev_out_grad, + grad.output_grad, frame_size, batch_size, active_node); + } + + if (value.prev_out_value && grad.prev_out_grad) { + math::gemm( + context, false, true, batch_size, frame_size, frame_size, 1, + grad.gate_grad + frame_size * 2, frame_size * 3, value.state_weight, + frame_size, 0, grad.reset_output_grad, frame_size); + + if (grad.state_weight_grad) { + math::gemm( + context, true, false, frame_size, frame_size, batch_size, 1, + value.reset_output_value, frame_size, + grad.gate_grad + frame_size * 2, frame_size * 3, 1, + grad.state_weight_grad, frame_size); + } + } + + if (batch_size == 1) { + detail::KeGruBackwardResetGrad< + detail::backward::gru_resetGrad, + /* is_batch= */ false><<>>( + detail::backward::gru_resetGrad(), value.gate_value, + grad.gate_grad, value.prev_out_value, grad.prev_out_grad, + grad.reset_output_grad, frame_size, batch_size, active_gate); + } else { + detail::KeGruBackwardResetGrad< + detail::backward::gru_resetGrad, + /* is_batch= */ true><<>>( + detail::backward::gru_resetGrad(), value.gate_value, + grad.gate_grad, value.prev_out_value, grad.prev_out_grad, + grad.reset_output_grad, frame_size, batch_size, active_gate); + } + + if (grad.prev_out_grad && value.prev_out_value) { + math::gemm( + context, false, true, batch_size, frame_size, frame_size * 2, 1, + grad.gate_grad, frame_size * 3, value.gate_weight, frame_size * 2, 1, + grad.prev_out_grad, frame_size); + + if (grad.gate_weight_grad) { + math::gemm( + context, true, false, frame_size, frame_size * 2, batch_size, 1, + value.prev_out_value, frame_size, grad.gate_grad, frame_size * 3, 1, + grad.gate_weight_grad, frame_size * 2); + } + } + } +}; + +template struct GRUUnitFunctor; +template struct GRUUnitFunctor; +template struct GRUUnitGradFunctor; +template struct GRUUnitGradFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/gru_compute.h b/paddle/fluid/operators/math/gru_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..93e19cf55782facfd95affdc77dcf78c511d8bbd --- /dev/null +++ b/paddle/fluid/operators/math/gru_compute.h @@ -0,0 +1,60 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/operators/math/detail/activation_functions.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { +namespace math { + +template +struct GRUMetaValue { + T *gate_weight; + T *state_weight; + T *gate_value; + T *reset_output_value; + T *output_value; + T *prev_out_value; +}; + +template +struct GRUMetaGrad { + T *gate_weight_grad; + T *state_weight_grad; + T *gate_grad; + T *reset_output_grad; + T *output_grad; + T *prev_out_grad; +}; + +template +struct GRUUnitFunctor { + static void compute(const DeviceContext &context, GRUMetaValue value, + int frame_size, int batch_size, + const detail::ActivationType active_node, + const detail::ActivationType active_gate); +}; + +template +struct GRUUnitGradFunctor { + static void compute(const DeviceContext &context, GRUMetaValue value, + GRUMetaGrad grad, int frame_size, int batch_size, + const detail::ActivationType active_node, + const detail::ActivationType active_gate); +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc new file mode 100644 index 0000000000000000000000000000000000000000..c298b00bb4cb9df8f9a54b4420edb07aed9cf891 --- /dev/null +++ b/paddle/fluid/operators/math/im2col.cc @@ -0,0 +1,313 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/im2col.h" + +namespace paddle { +namespace operators { +namespace math { + +/* + * im = [input_channels, input_height, input_width] + * col = + * [input_channels, filter_height, filter_width, output_height, output_width] + */ +template +class Im2ColFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& im, const std::vector& dilation, + const std::vector& stride, + const std::vector& padding, framework::Tensor* col) { + PADDLE_ENFORCE(im.dims().size() == 3); + PADDLE_ENFORCE(col->dims().size() == 5); + + int im_channels = im.dims()[0]; + int im_height = im.dims()[1]; + int im_width = im.dims()[2]; + int filter_height = col->dims()[1]; + int filter_width = col->dims()[2]; + int col_height = col->dims()[3]; + int col_width = col->dims()[4]; + + PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] - + ((dilation[0] * (filter_height - 1) + 1))) / + stride[0] + + 1, + col_height, + "Output_height and padding(padding_up, padding_down) are " + "inconsistent."); + PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] - + ((dilation[1] * (filter_width - 1) + 1))) / + stride[1] + + 1, + col_width, + "Output_height and padding(padding_up, padding_down) are " + "inconsistent."); + + int channels_col = im_channels * filter_height * filter_width; + + const T* im_data = im.data(); + T* col_data = col->data(); + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % filter_width; + int h_offset = (c / filter_width) % filter_height; + int c_im = c / (filter_width * filter_height); + for (int h = 0; h < col_height; ++h) { + int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; + for (int w = 0; w < col_width; ++w) { + int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; + int col_idx = (c * col_height + h) * col_width + w; + int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx; + + col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height || + im_col_idx < 0 || im_col_idx >= im_width) + ? 
static_cast(0) + : im_data[im_idx]; + } + } + } + } +}; + +/* + * im = [input_channels, input_height, input_width] + * col = + * [input_channels, filter_height, filter_width, output_height, output_width] + */ +template +class Col2ImFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& col, + const std::vector& dilation, + const std::vector& stride, + const std::vector& padding, framework::Tensor* im) { + PADDLE_ENFORCE(im->dims().size() == 3); + PADDLE_ENFORCE(col.dims().size() == 5); + int im_channels = im->dims()[0]; + int im_height = im->dims()[1]; + int im_width = im->dims()[2]; + int filter_height = col.dims()[1]; + int filter_width = col.dims()[2]; + int col_height = col.dims()[3]; + int col_width = col.dims()[4]; + + PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] - + ((dilation[0] * (filter_height - 1) + 1))) / + stride[0] + + 1, + col_height, + "Output_height and padding(padding_up, padding_down) are " + "inconsistent."); + PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] - + ((dilation[1] * (filter_width - 1) + 1))) / + stride[1] + + 1, + col_width, + "Output_height and padding(padding_up, padding_down) are " + "inconsistent."); + + int channels_col = im_channels * filter_height * filter_width; + + T* im_data = im->data(); + const T* col_data = col.data(); + + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % filter_width; + int h_offset = (c / filter_width) % filter_height; + int c_im = c / (filter_width * filter_height); + for (int h = 0; h < col_height; ++h) { + int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; + for (int w = 0; w < col_width; ++w) { + int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; + if ((im_row_idx) >= 0 && (im_row_idx) < im_height && + (im_col_idx) >= 0 && (im_col_idx) < im_width) { + im_data[(im_row_idx + c_im * im_height) * im_width + im_col_idx] += + col_data[(c * col_height + h) * col_width + w]; + } + } + } + } + } +}; + +template class Im2ColFunctor; +template class Im2ColFunctor; +template class Col2ImFunctor; +template class Col2ImFunctor; + +/* + * im = [input_channels, input_height, input_width] + * col = + * [output_height, output_width, input_channels, filter_height, filter_width] + */ +template +class Im2ColFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& im, const std::vector& dilation, + const std::vector& stride, + const std::vector& padding, framework::Tensor* col) { + PADDLE_ENFORCE(im.dims().size() == 3); + PADDLE_ENFORCE(col->dims().size() == 5); + int im_channels = im.dims()[0]; + int im_height = im.dims()[1]; + int im_width = im.dims()[2]; + int filter_height = col->dims()[3]; + int filter_width = col->dims()[4]; + int col_height = col->dims()[0]; + int col_width = col->dims()[1]; + + PADDLE_ENFORCE_EQ( + (im_height + padding[0] + padding[2] - filter_height) / stride[0] + 1, + col_height, + "Output_height and padding(padding_up, padding_down) are " + "inconsistent."); + PADDLE_ENFORCE_EQ( + (im_width + padding[1] + padding[3] - filter_width) / stride[1] + 1, + col_width, + "col_width and padding(padding_left, padding_right) are " + "inconsistent."); + + const T* im_data = im.data(); + T* col_data = col->data(); + + for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) { + for (int col_col_idx = 0; col_col_idx < col_width; ++col_col_idx) { + for (int channel = 0; channel < im_channels; ++channel) { + for (int filter_row_idx = 0; filter_row_idx < 
filter_height; + ++filter_row_idx) { + int im_row_offset = + col_row_idx * stride[0] + filter_row_idx - padding[0]; + for (int filter_col_idx = 0; filter_col_idx < filter_width; + ++filter_col_idx) { + int im_col_offset = + col_col_idx * stride[1] + filter_col_idx - padding[1]; + + int col_offset = + ((((col_row_idx)*col_width + col_col_idx) * im_channels + + channel) * + filter_height + + filter_row_idx) * + filter_width + + filter_col_idx; + + int im_offset = (channel * im_height + im_row_offset) * im_width + + im_col_offset; + col_data[col_offset] = + (im_row_offset < 0 || im_row_offset >= im_height || + im_col_offset < 0 || im_col_offset >= im_width) + ? static_cast(0) + : im_data[im_offset]; + } + } + } + } + } + } +}; + +/* + * im = [input_channels, input_height, input_width] + * col = + * [output_height, output_width, input_channels, filter_height, filter_width] + */ +template +class Col2ImFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& col, + const std::vector& dilation, + const std::vector& stride, + const std::vector& padding, framework::Tensor* im) { + PADDLE_ENFORCE(im->dims().size() == 3); + PADDLE_ENFORCE(col.dims().size() == 5); + int im_channels = im->dims()[0]; + int im_height = im->dims()[1]; + int im_width = im->dims()[2]; + int filter_height = col.dims()[3]; + int filter_width = col.dims()[4]; + int col_height = col.dims()[0]; + int col_width = col.dims()[1]; + + PADDLE_ENFORCE_EQ( + (im_height + padding[0] + padding[2] - filter_height) / stride[0] + 1, + col_height, + "Output_height and padding(padding_up, padding_down) are " + "inconsistent."); + PADDLE_ENFORCE_EQ( + (im_width + padding[1] + padding[3] - filter_width) / stride[1] + 1, + col_width, + "col_width and padding(padding_left, padding_right) are " + "inconsistent."); + + T* im_data = im->data(); + const T* col_data = col.data(); + + for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) { + for (int col_col_idx = 0; col_col_idx < col_width; ++col_col_idx) { + for (int channel = 0; channel < im_channels; ++channel) { + for (int filter_row_idx = 0; filter_row_idx < filter_height; + ++filter_row_idx) { + int im_row_offset = + col_row_idx * stride[0] + filter_row_idx - padding[0]; + for (int filter_col_idx = 0; filter_col_idx < filter_width; + ++filter_col_idx) { + int im_col_offset = + col_col_idx * stride[1] + filter_col_idx - padding[1]; + + int col_offset = + (((col_row_idx * col_width + col_col_idx) * im_channels + + channel) * + filter_height + + filter_row_idx) * + filter_width + + filter_col_idx; + + if (im_row_offset >= 0 && im_row_offset < im_height && + im_col_offset >= 0 && im_col_offset < im_width) { + int im_offset = + (channel * im_height + im_row_offset) * im_width + + im_col_offset; + im_data[im_offset] += col_data[col_offset]; + } + } + } + } + } + } + } +}; + +template class Im2ColFunctor; +template class Im2ColFunctor; +template class Col2ImFunctor; +template class Col2ImFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/im2col.cu b/paddle/fluid/operators/math/im2col.cu new file mode 100644 index 0000000000000000000000000000000000000000..c26343aacf524c4381a8bba1e4e0d1a07bee6d6e --- /dev/null +++ b/paddle/fluid/operators/math/im2col.cu @@ -0,0 +1,424 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/im2col.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { +namespace math { + +template +__global__ void im2col(const T* data_im, int num_outs, int im_height, + int im_width, int dilation_h, int dilation_w, + int filter_height, int filter_width, int stride_height, + int stride_width, int padding_height, int padding_width, + int col_height, int col_width, T* data_col) { + const int index = + (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + if (index < num_outs) { + int w_out = index % col_width; + int h_out = (index / col_width) % col_height; + int channel_in = index / col_width / col_height; + int channel_out = channel_in * filter_height * filter_width; + int h_in = h_out * stride_height - padding_height; + int w_in = w_out * stride_width - padding_width; + + data_col += (channel_out * col_height + h_out) * col_width + w_out; + data_im += (channel_in * im_height + h_in) * im_width + w_in; + for (int i = 0; i < filter_height; ++i) { + for (int j = 0; j < filter_width; ++j) { + int rIdx = h_in + i * dilation_h; + int cIdx = w_in + j * dilation_w; + *data_col = + (rIdx >= im_height || rIdx < 0 || cIdx >= im_width || cIdx < 0) + ? 0 + : data_im[i * dilation_h * im_width + j * dilation_w]; + data_col += col_height * col_width; + } + } + } +} + +/* + * im = [input_channels, input_height, input_width] + * col = + * [input_channels, filter_height, filter_width, output_height, output_width] + */ +template +class Im2ColFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& im, const std::vector& dilation, + const std::vector& stride, + const std::vector& padding, framework::Tensor* col) { + PADDLE_ENFORCE(im.dims().size() == 3); + PADDLE_ENFORCE(col->dims().size() == 5); + + int im_channels = im.dims()[0]; + int im_height = im.dims()[1]; + int im_width = im.dims()[2]; + int filter_height = col->dims()[1]; + int filter_width = col->dims()[2]; + int col_height = col->dims()[3]; + int col_width = col->dims()[4]; + + PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] - + (dilation[0] * (filter_height - 1) + 1)) / + stride[0] + + 1, + col_height, + "Output_height and padding(padding_up, padding_down) are " + "inconsistent."); + PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] - + (dilation[1] * (filter_width - 1) + 1)) / + stride[1] + + 1, + col_width, + "col_width and padding(padding_left, padding_right) are " + "inconsistent."); + + int num_outputs = im_channels * col_height * col_width; + int blocks = (num_outputs + 1024 - 1) / 1024; + int block_x = 512; + int block_y = (blocks + 512 - 1) / 512; + dim3 threads(1024, 1); + dim3 grid(block_x, block_y); + im2col<<>>( + im.data(), num_outputs, im_height, im_width, dilation[0], + dilation[1], filter_height, filter_width, stride[0], stride[1], + padding[0], padding[1], col_height, col_width, col->data()); + } +}; + +template +__global__ void col2im(int n, const T* data_col, int im_height, int im_width, + int dilation_h, int dilation_w, int filter_height, + int filter_width, int stride_height, int 
stride_width, + int padding_height, int padding_width, int col_height, + int col_width, T* data_im) { + const int index = + (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + const int d_filter_height = dilation_h * (filter_height - 1) + 1; + const int d_filter_width = dilation_w * (filter_width - 1) + 1; + + if (index < n) { + T val = 0; + int w = index % im_width + padding_width; + int h = (index / im_width) % im_height + padding_height; + int c = index / (im_width * im_height); + + // compute the start and end of the output + int w_col_start = + (w < d_filter_width) ? 0 : (w - d_filter_width) / stride_width + 1; + int w_col_end = min(w / stride_width + 1, col_width); + int h_col_start = + (h < d_filter_height) ? 0 : (h - d_filter_height) / stride_height + 1; + int h_col_end = min(h / stride_height + 1, col_height); + + for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { + for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { + int h_off = (h - h_col * stride_height); + int w_off = (w - w_col * stride_width); + if (h_off % dilation_h == 0 && w_off % dilation_w == 0) { + h_off /= dilation_h; + w_off /= dilation_w; + int data_col_index = + (((c * filter_height + h_off) * filter_width + w_off) * + col_height + + h_col) * + col_width + + w_col; + + val += data_col[data_col_index]; + } + } + } + data_im[index] = val; + } +} + +/* + * im = [input_channels, input_height, input_width] + * col = + * [input_channels, filter_height, filter_width, output_height, output_width] + */ +template +class Col2ImFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& col, + const std::vector& dilation, + const std::vector& stride, + const std::vector& padding, framework::Tensor* im) { + PADDLE_ENFORCE(im->dims().size() == 3); + PADDLE_ENFORCE(col.dims().size() == 5); + + int im_channels = im->dims()[0]; + int im_height = im->dims()[1]; + int im_width = im->dims()[2]; + int filter_height = col.dims()[1]; + int filter_width = col.dims()[2]; + int col_height = col.dims()[3]; + int col_width = col.dims()[4]; + + PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] - + (dilation[0] * (filter_height - 1) + 1)) / + stride[0] + + 1, + col_height, + "Output_height and padding(padding_up, padding_down) are " + "inconsistent."); + PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] - + (dilation[1] * (filter_width - 1) + 1)) / + stride[1] + + 1, + col_width, + "col_width and padding(padding_left, padding_right) are " + "inconsistent."); + + size_t num_kernels = im_channels * im_height * im_width; + + size_t blocks = (num_kernels + 1024 - 1) / 1024; + size_t block_x = 512; + size_t block_y = (blocks + 512 - 1) / 512; + dim3 threads(1024, 1); + dim3 grid(block_x, block_y); + + // To avoid involving atomic operations, we will launch one kernel per + // bottom dimension, and then in the kernel add up the top dimensions. 
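+    // Concretely: each thread owns exactly one image element (the index runs
+    // over im_channels * im_height * im_width) and gathers every matching
+    // column entry into a local sum, so no two threads write the same output
+    // and no CudaAtomicAdd is required. The OCF col2im kernel further below
+    // scatters from column space instead and therefore does use
+    // CudaAtomicAdd.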
+ col2im<<>>( + num_kernels, col.data(), im_height, im_width, dilation[0], + dilation[1], filter_height, filter_width, stride[0], stride[1], + padding[0], padding[2], col_height, col_width, im->data()); + } +}; + +template class Im2ColFunctor; +template class Im2ColFunctor; +template class Col2ImFunctor; +template class Col2ImFunctor; + +template +__global__ void im2colOCF(const T* im_data, int im_channels, int im_height, + int im_width, int filter_height, int filter_width, + int stride_height, int stride_width, + int padding_height, int padding_width, int col_height, + int col_width, T* col_data) { + int swid = blockIdx.x; + int shid = blockIdx.y; + for (int channelid = threadIdx.z; channelid < im_channels; + channelid += blockDim.z) { + for (int idy = threadIdx.y; idy < filter_height; idy += blockDim.y) { + for (int idx = threadIdx.x; idx < filter_width; idx += blockDim.x) { + int width_offset = idx + swid * stride_width - padding_width; + int height_offset = idy + shid * stride_height - padding_height; + int im_offset = width_offset + height_offset * im_width + + channelid * im_height * im_width; + + int col_offset = idx + idy * filter_width + + channelid * filter_height * filter_width + + (shid * col_width + swid) * + (im_channels * filter_height * filter_width); + + col_data[col_offset] = + (height_offset >= im_height || height_offset < 0 || + width_offset >= im_width || width_offset < 0) + ? T(0) + : im_data[im_offset]; + } + } + } +} + +/* + * im = [input_channels, input_height, input_width] + * col = + * [output_height, output_width, input_channels, filter_height, filter_width] + */ +template +class Im2ColFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& im, const std::vector& dilation, + const std::vector& stride, + const std::vector& padding, framework::Tensor* col) { + PADDLE_ENFORCE(im.dims().size() == 3); + PADDLE_ENFORCE(col->dims().size() == 5); + int im_channels = im.dims()[0]; + int im_height = im.dims()[1]; + int im_width = im.dims()[2]; + int filter_height = col->dims()[3]; + int filter_width = col->dims()[4]; + int col_height = col->dims()[0]; + int col_width = col->dims()[1]; + + PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] - + (dilation[0] * (filter_height - 1) + 1)) / + stride[0] + + 1, + col_height, + "Output_height and padding(padding_up, padding_down) are " + "inconsistent."); + PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] - + (dilation[1] * (filter_width - 1) + 1)) / + stride[1] + + 1, + col_width, + "col_width and padding(padding_left, padding_right) are " + "inconsistent."); + + int block_dim_x = 0; + int block_dim_y = 0; + if (filter_height <= 4 && filter_width <= 4) { + block_dim_x = 4; + block_dim_y = 4; + } else if (filter_height <= 8 && filter_width <= 8) { + block_dim_x = 8; + block_dim_y = 8; + } else if (filter_height <= 16 && filter_width <= 16) { + block_dim_x = 16; + block_dim_y = 16; + } else { + block_dim_x = 32; + block_dim_y = 32; + } + + int block_dim_z = 1024 / block_dim_x / block_dim_y; + dim3 threads(block_dim_x, block_dim_y, std::min(block_dim_z, im_channels)); + dim3 grid(col_width, col_height); + im2colOCF<<>>( + im.data(), im_channels, im_height, im_width, filter_height, + filter_width, stride[0], stride[1], padding[0], padding[1], col_height, + col_width, col->data()); + } +}; + +template +__global__ void col2imOCF(const T* col_data, int im_channels, int im_height, + int im_width, int filter_height, int filter_width, + int stride_height, int stride_width, + int 
padding_height, int padding_width, int col_height, + int col_width, T* im_data) { + int swid = blockIdx.x; + int shid = blockIdx.y; + for (int channelid = threadIdx.z; channelid < im_channels; + channelid += blockDim.z) { + for (int idy = threadIdx.y; idy < filter_height; idy += blockDim.y) { + for (int idx = threadIdx.x; idx < filter_width; idx += blockDim.x) { + int width_offset = idx + swid * stride_width - padding_width; + int height_offset = idy + shid * stride_height - padding_height; + int im_offset = width_offset + height_offset * im_width + + channelid * im_height * im_width; + + int col_offset = idx + idy * filter_width + + channelid * filter_height * filter_width + + (shid * col_width + swid) * + (im_channels * filter_height * filter_width); + + if (height_offset >= 0 && height_offset < im_height && + width_offset >= 0 && width_offset < im_width) { + paddle::platform::CudaAtomicAdd(im_data + im_offset, + col_data[col_offset]); + } + } + } + } +} + +/* + * im = [input_channels, input_height, input_width] + * col = + * [output_height, output_width, input_channels, filter_height, filter_width] + */ +template +class Col2ImFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& col, + const std::vector& dilation, + const std::vector& stride, + const std::vector& padding, framework::Tensor* im) { + PADDLE_ENFORCE(im->dims().size() == 3); + PADDLE_ENFORCE(col.dims().size() == 5); + int im_channels = im->dims()[0]; + int im_height = im->dims()[1]; + int im_width = im->dims()[2]; + int filter_height = col.dims()[3]; + int filter_width = col.dims()[4]; + int col_height = col.dims()[0]; + int col_width = col.dims()[1]; + + PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] - + (dilation[0] * (filter_height - 1) + 1)) / + stride[0] + + 1, + col_height, + "Output_height and padding(padding_up, padding_down) are " + "inconsistent."); + PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] - + (dilation[1] * (filter_width - 1) + 1)) / + stride[1] + + 1, + col_width, + "col_width and padding(padding_left, padding_right) are " + "inconsistent."); + + int block_dim_x = 0; + int block_dim_y = 0; + if (filter_height <= 4 && filter_width <= 4) { + block_dim_x = 4; + block_dim_y = 4; + } else if (filter_height <= 8 && filter_width <= 8) { + block_dim_x = 8; + block_dim_y = 8; + } else if (filter_height <= 16 && filter_width <= 16) { + block_dim_x = 16; + block_dim_y = 16; + } else { + block_dim_x = 32; + block_dim_y = 32; + } + + int block_dim_z = 1024 / block_dim_x / block_dim_y; + dim3 threads(block_dim_x, block_dim_y, std::min(block_dim_z, im_channels)); + dim3 grid(col_width, col_height); + col2imOCF<<>>( + col.data(), im_channels, im_height, im_width, filter_height, + filter_width, stride[0], stride[1], padding[0], padding[1], col_height, + col_width, im->data()); + } +}; + +template class Im2ColFunctor; +template class Im2ColFunctor; +template class Col2ImFunctor; +template class Col2ImFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/im2col.h b/paddle/fluid/operators/math/im2col.h new file mode 100644 index 0000000000000000000000000000000000000000..525c0f5dda102ef92a9d79832fecc10f99ccd900 --- /dev/null +++ b/paddle/fluid/operators/math/im2col.h @@ -0,0 +1,102 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace math { + +/* The storage format of the coldata in the Im2ColFunctor and Col2ImFunctor. */ +enum class ColFormat { kCFO = 0, kOCF = 1 }; + +/* + * \brief Converts the image data of three dimensions(CHW) into a colData of + * five dimensions in the Im2ColFunctor calculation, + * And in the Col2ImFunctor calculation, it is reversed. + * + * \param imData Image data. + * \param imShape The shape of imData, + * [input_channels, input_height, input_width]. + * \param colData Column data. + * \param colShape The shape of colData. + * + * \param dilations dilation data. + * \param 2-dimension [dilation_height, dilation_width]. + * + * \param strides stride data. + * \param 2-dimension [stride_height, stride_width]. + * + * \param paddings padding data. + * \param 4-dimension [up_pad, left_pad, down_pad, right_pad]. + * + * If the template argument Format is kCFO, the shape of colData is: + * [input_channels, filter_height, filter_width, output_height, output_width] + * So, it is easy to reshape into a convolution matrix for convolution + * calculation based on matrix multiplication. + * The shape of convolution matrix is [height, width], where the height is equal + * input_channels * filter_height * filter_width, and the width is equal + * output_height * output_width. + * + * Reshape: + * shape of colData shape of convolution matrix + * [input_channels, + * filter_height, + * filter_width, ======> [height, width] + * output_height, + * output_width] + * + * If the template argument Format is kOCF, the shape of colData is: + * [output_height, output_width, input_channels, filter_height, filter_width] + * So, it is easy to reshape into a sequence matrix for rnn calculation. + * The shape of sequence matrix is [seq_length, step_size], where the seq_length + * is equal output_height * output_width, and the step_size is equal + * input_channels * filter_height * filter_width. + * + * Reshape: + * shape of colData shape of sequence matrix + * [output_height, + * output_width, + * input_channels, ======> [seqLength, stepSize] + * filter_height, + * filter_width] + * + * \note The caller needs to ensure that imShape.inputChannels is equal to + * colShape.inputChannels. 
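 *
 * For intuition, a small illustrative example (shapes chosen only for this
 * note): with imShape = [input_channels = 3, input_height = 4, input_width = 4],
 * a 3 x 3 filter, stride 1 and zero padding, output_height = output_width =
 * (4 - 3) / 1 + 1 = 2. Then
 *   kCFO colData shape: [3, 3, 3, 2, 2] ==> convolution matrix [27, 4]
 *   kOCF colData shape: [2, 2, 3, 3, 3] ==> sequence matrix    [4, 27]
 * where 27 = input_channels * filter_height * filter_width and 4 is the
 * number of output positions (2 * 2).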
+ */ +template +class Im2ColFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& im, + const std::vector& dilation, + const std::vector& stride, + const std::vector& padding, framework::Tensor* col); +}; + +template +class Col2ImFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& col, + const std::vector& dilation, + const std::vector& stride, + const std::vector& padding, framework::Tensor* im); +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/im2col_test.cc b/paddle/fluid/operators/math/im2col_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..59d6a84b892fc32aceb5622a856a5c648a3ade5b --- /dev/null +++ b/paddle/fluid/operators/math/im2col_test.cc @@ -0,0 +1,168 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/im2col.h" +#include + +template +void testIm2col() { + paddle::framework::Tensor input_tmp; + paddle::framework::Tensor input; + paddle::framework::Tensor output_cfo; + paddle::framework::Tensor output_ocf; + paddle::framework::Tensor output_tmp; + + /** + * input = [0, 1, 2, + * 3, 4, 5] + * + * output_cfo = [0, 1 + * 1, 2 + * 3, 4 + * 4, 5] + * + * output_ocf = [0, 1, 3, 4 + * 1, 2, 4, 5] + * + * col2im_cfo = [0, 2, 2 + * 3, 4, 5] + * + * col2im_ocf = [0, 2, 2 + * 3, 4, 5] + */ + int input_height = 2; + int input_width = 3; + int filter_size = 2; + std::vector stride({1, 1}); // stride_y, stride_x + std::vector padding( + {0, 0, 0, 0}); // up_pad, left_pad, down_pad, right_pad + std::vector dilation({1, 1}); // dilation_y, dilation_x + int output_height = + (input_height - filter_size + padding[0] + padding[1]) / stride[0] + 1; + int output_width = + (input_width - filter_size + padding[2] + padding[3]) / stride[1] + 1; + float* input_ptr = input_tmp.mutable_data( + {1, input_height, input_width}, paddle::platform::CPUPlace()); + float arr[6] = {0, 1, 2, 3, 4, 5}; + memcpy(input_ptr, arr, 6 * sizeof(float)); + + auto* place = new Place(); + DeviceContext* context = new DeviceContext(*place); + if (paddle::platform::is_cpu_place(*place)) { + input = input_tmp; + } else { + Copy(input_tmp, *place, *context, &input); + } + output_cfo.mutable_data( + {1, filter_size, filter_size, output_height, output_width}, *place); + output_ocf.mutable_data( + {output_height, output_width, 1, filter_size, filter_size}, *place); + + // Im2Col + paddle::operators::math::Im2ColFunctor< + paddle::operators::math::ColFormat::kCFO, DeviceContext, float> + im2col; + paddle::operators::math::Im2ColFunctor< + paddle::operators::math::ColFormat::kOCF, DeviceContext, float> + im2col_ocf; + + im2col(*context, input, dilation, stride, padding, &output_cfo); + im2col_ocf(*context, input, dilation, stride, padding, &output_ocf); + + float out_cfo_data[] = {0, 1, 1, 2, 3, 4, 4, 5}; + float out_ocf_data[] = {0, 1, 3, 4, 1, 2, 4, 5}; + + float* out_cfo_ptr; + 
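  // Hand-worked check of the expected data used below, for the 2 x 3 input
  // with a 2 x 2 filter, stride 1 and no padding:
  //   the output is 1 x 2; the two sliding windows are [0 1; 3 4] and [1 2; 4 5].
  //   kCFO groups by filter element across windows -> {0,1, 1,2, 3,4, 4,5}.
  //   kOCF groups by window                        -> {0,1,3,4, 1,2,4,5}.
  //   col2im adds the overlapping contributions back, so the shared elements
  //   1 and 4 are counted twice: {0, 2, 2, 3, 8, 5} (= col2im_data below).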
if (paddle::platform::is_cpu_place(*place)) { + out_cfo_ptr = output_cfo.data(); + } else { + Copy(output_cfo, paddle::platform::CPUPlace(), *context, &output_tmp); + out_cfo_ptr = output_tmp.data(); + } + for (int i = 0; i < 6; ++i) { + EXPECT_EQ(out_cfo_ptr[i], out_cfo_data[i]); + } + + float* out_ocf_ptr; + if (paddle::platform::is_cpu_place(*place)) { + out_ocf_ptr = output_ocf.data(); + } else { + Copy(output_ocf, paddle::platform::CPUPlace(), *context, &output_tmp); + out_ocf_ptr = output_tmp.data(); + } + + for (int i = 0; i < 6; ++i) { + EXPECT_EQ(out_ocf_ptr[i], out_ocf_data[i]); + } + + // Col2Im: kCFO + paddle::operators::math::Col2ImFunctor< + paddle::operators::math::ColFormat::kCFO, DeviceContext, float> + col2im; + paddle::operators::math::Col2ImFunctor< + paddle::operators::math::ColFormat::kOCF, DeviceContext, float> + col2im_ocf; + float col2im_data[] = {0, 2, 2, 3, 8, 5}; + + memset(input_ptr, 0, 6 * sizeof(float)); + if (paddle::platform::is_cpu_place(*place)) { + input = input_tmp; + } else { + Copy(input_tmp, *place, *context, &input); + } + + col2im(*context, output_cfo, dilation, stride, padding, &input); + + float* in_ptr; + if (paddle::platform::is_cpu_place(*place)) { + in_ptr = input.data(); + } else { + Copy(input, paddle::platform::CPUPlace(), *context, &input_tmp); + in_ptr = input_tmp.data(); + } + for (int i = 0; i < 6; ++i) { + EXPECT_EQ(in_ptr[i], col2im_data[i]); + } + + // Col2Im: kOCF + memset(input_ptr, 0, 6 * sizeof(float)); + if (paddle::platform::is_cpu_place(*place)) { + input = input_tmp; + } else { + Copy(input_tmp, *place, *context, &input); + } + + col2im_ocf(*context, output_ocf, dilation, stride, padding, &input); + + if (paddle::platform::is_cpu_place(*place)) { + in_ptr = input.data(); + } else { + Copy(input, paddle::platform::CPUPlace(), *context, &input_tmp); + in_ptr = input_tmp.data(); + } + for (int i = 0; i < 6; ++i) { + EXPECT_EQ(in_ptr[i], col2im_data[i]); + } + + delete place; + delete context; +} + +TEST(math, im2col) { + testIm2col(); +#ifdef PADDLE_WITH_CUDA + testIm2col(); +#endif +} diff --git a/paddle/fluid/operators/math/lstm_compute.cc b/paddle/fluid/operators/math/lstm_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..09eb89ec58d107f547a5c83908a9d2a541aa95f4 --- /dev/null +++ b/paddle/fluid/operators/math/lstm_compute.cc @@ -0,0 +1,82 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/lstm_compute.h" +#include "paddle/fluid/operators/math/detail/lstm_cpu_kernel.h" +#include "paddle/fluid/operators/math/detail/lstm_kernel.h" + +namespace paddle { +namespace operators { +namespace math { + +template +struct LstmUnitFunctor { + static void compute(const platform::CPUDeviceContext& context, + LstmMetaValue value, int frame_size, int batch_size, + const detail::ActivationType& gate_act, + const detail::ActivationType& cell_act, + const detail::ActivationType& cand_act) { + for (int b = 0; b < batch_size; b++) { + detail::cpu_lstm_forward(detail::forward::lstm(), value, frame_size, + cand_act, gate_act, cell_act); + value.gate_value += frame_size * 4; + value.state_value += frame_size; + value.state_active_value += frame_size; + value.output_value += frame_size; + if (value.prev_state_value) { + value.prev_state_value += frame_size; + } + } + } +}; + +template +struct LstmUnitGradFunctor { + static void compute(const platform::CPUDeviceContext& context, + LstmMetaValue value, LstmMetaGrad grad, + int frame_size, int batch_size, + const detail::ActivationType& gate_act, + const detail::ActivationType& cell_act, + const detail::ActivationType& cand_act) { + for (int b = 0; b < batch_size; b++) { + detail::cpu_lstm_backward(detail::backward::lstm(), value, grad, + frame_size, cand_act, gate_act, cell_act); + + value.gate_value += frame_size * 4; + value.state_value += frame_size; + value.state_active_value += frame_size; + value.output_value += frame_size; + if (value.prev_state_value) { + value.prev_state_value += frame_size; + } + + grad.gate_grad += frame_size * 4; + grad.state_grad += frame_size; + grad.state_active_grad += frame_size; + grad.output_grad += frame_size; + if (grad.prev_state_grad) { + grad.prev_state_grad += frame_size; + } + } + } +}; + +template class LstmUnitFunctor; +template class LstmUnitFunctor; +template class LstmUnitGradFunctor; +template class LstmUnitGradFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/lstm_compute.cu b/paddle/fluid/operators/math/lstm_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..adedee28bd010c611b9c6901c55bf67e73d3639b --- /dev/null +++ b/paddle/fluid/operators/math/lstm_compute.cu @@ -0,0 +1,57 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/detail/lstm_gpu_kernel.h" +#include "paddle/fluid/operators/math/detail/lstm_kernel.h" +#include "paddle/fluid/operators/math/lstm_compute.h" + +namespace paddle { +namespace operators { +namespace math { + +template +struct LstmUnitFunctor { + static void compute(const platform::CUDADeviceContext& context, + LstmMetaValue value, int frame_size, int batch_size, + const detail::ActivationType& gate_act, + const detail::ActivationType& cell_act, + const detail::ActivationType& cand_act) { + detail::gpu_lstm_forward(context, detail::forward::lstm(), value, + frame_size, batch_size, cand_act, gate_act, + cell_act); + } +}; + +template +struct LstmUnitGradFunctor { + static void compute(const platform::CUDADeviceContext& context, + LstmMetaValue value, LstmMetaGrad grad, + int frame_size, int batch_size, + const detail::ActivationType& gate_act, + const detail::ActivationType& cell_act, + const detail::ActivationType& cand_act) { + detail::gpu_lstm_backward(context, detail::backward::lstm(), value, grad, + frame_size, batch_size, cand_act, gate_act, + cell_act); + } +}; + +template class LstmUnitFunctor; +template class LstmUnitFunctor; +template class LstmUnitGradFunctor; +template class LstmUnitGradFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/lstm_compute.h b/paddle/fluid/operators/math/lstm_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..8610e96cf1abb0e5a64cd60c6bab3a8c08754587 --- /dev/null +++ b/paddle/fluid/operators/math/lstm_compute.h @@ -0,0 +1,71 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/operators/math/detail/activation_functions.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { +namespace math { + +template +struct LstmMetaValue { + T *gate_value; + T *prev_state_value; + T *state_value; + T *state_active_value; + T *output_value; + T *check_ig; + T *check_fg; + T *check_og; +}; + +template +struct LstmMetaGrad { + T *gate_grad; + T *prev_state_grad; + T *state_grad; + T *state_active_grad; + T *output_grad; + T *check_ig_grad; + T *check_fg_grad; + T *check_og_grad; +}; + +template +class LstmUnitFunctor { + public: + static void compute(const DeviceContext &context, LstmMetaValue value, + int frame_size, int batch_size, + const detail::ActivationType &gate_act, + const detail::ActivationType &cell_act, + const detail::ActivationType &cand_act); +}; + +template +class LstmUnitGradFunctor { + public: + static void compute(const DeviceContext &context, LstmMetaValue value, + LstmMetaGrad grad, int frame_size, int batch_size, + const detail::ActivationType &gate_act, + const detail::ActivationType &cell_act, + const detail::ActivationType &cand_act); +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc new file mode 100644 index 0000000000000000000000000000000000000000..2636dbddde67955a99663aa93df1425b9e1ec2ce --- /dev/null +++ b/paddle/fluid/operators/math/math_function.cc @@ -0,0 +1,342 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/operators/math/math_function_impl.h" + +namespace paddle { +namespace operators { +namespace math { + +template <> +void gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C) { + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + cblas_sgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb, + beta, C, ldc); +} + +template <> +void gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const double alpha, const double* A, const double* B, const double beta, + double* C) { + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? 
N : K; + int ldc = N; + cblas_dgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb, + beta, C, ldc); +} + +template <> +void gemm( + const platform::CPUDeviceContext& context, const bool transA, + const bool transB, const int M, const int N, const int K, const float alpha, + const float* A, const int lda, const float* B, const int ldb, + const float beta, float* C, const int ldc) { + cblas_sgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans, + transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A, + lda, B, ldb, beta, C, ldc); +} + +template <> +void gemm( + const platform::CPUDeviceContext& context, const bool transA, + const bool transB, const int M, const int N, const int K, + const double alpha, const double* A, const int lda, const double* B, + const int ldb, const double beta, double* C, const int ldc) { + cblas_dgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans, + transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A, + lda, B, ldb, beta, C, ldc); +} + +template <> +void matmul( + const platform::CPUDeviceContext& context, + const framework::Tensor& matrix_a, bool trans_a, + const framework::Tensor& matrix_b, bool trans_b, float alpha, + framework::Tensor* matrix_out, float beta) { + auto dim_a = matrix_a.dims(); + auto dim_b = matrix_b.dims(); + auto dim_out = matrix_out->dims(); + PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, + "The input and output of matmul be matrix"); + + PADDLE_ENFORCE(platform::is_cpu_place(matrix_a.place()) && + platform::is_cpu_place(matrix_b.place()) && + platform::is_cpu_place(matrix_out->place()), + "Matrix must all be in CPUPlace"); + + int M = dim_out[0]; + int N = dim_out[1]; + int K = (trans_a == false) ? dim_a[1] : dim_a[0]; + + CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; + CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans; + + gemm( + context, transA, transB, M, N, K, alpha, matrix_a.data(), + matrix_b.data(), beta, matrix_out->data()); +} + +template <> +void matmul( + const platform::CPUDeviceContext& context, + const framework::Tensor& matrix_a, bool trans_a, + const framework::Tensor& matrix_b, bool trans_b, double alpha, + framework::Tensor* matrix_out, double beta) { + auto dim_a = matrix_a.dims(); + auto dim_b = matrix_b.dims(); + auto dim_out = matrix_out->dims(); + PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, + "The input and output of matmul be matrix"); + + PADDLE_ENFORCE(platform::is_cpu_place(matrix_a.place()) && + platform::is_cpu_place(matrix_b.place()) && + platform::is_cpu_place(matrix_out->place()), + "Matrix must all be in CPUPlace"); + + int M = dim_out[0]; + int N = dim_out[1]; + int K = (trans_a == false) ? dim_a[1] : dim_a[0]; + + CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; + CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans; + + gemm( + context, transA, transB, M, N, K, alpha, matrix_a.data(), + matrix_b.data(), beta, matrix_out->data()); +} + +#ifdef PADDLE_WITH_MKLML +// Use cblas_{s,d}gemm_batched if available: Run with 1 group of size batchSize. 
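// Clarifying note on the MKL path below, based on how the calls are made
// here: cblas_{s,d}gemm_batch takes per-group parameter arrays plus arrays
// of A/B/C pointers. These wrappers use a single group of size batchCount,
// so each scalar argument (transA, transB, M, N, K, alpha, beta, ld*) is
// passed by address once, and the pointer arrays are built by stepping
// through the packed inputs with strideA / strideB; the output batches are
// assumed to be packed contiguously with stride M * N.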
+template <> +void batched_gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C, const int batchCount, const int strideA, const int strideB) { + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + auto a_array = std::vector(batchCount); + auto b_array = std::vector(batchCount); + auto c_array = std::vector(batchCount); + for (int k = 0; k < batchCount; ++k) { + a_array[k] = &A[k * strideA]; + b_array[k] = &B[k * strideB]; + c_array[k] = &C[k * M * N]; + } + cblas_sgemm_batch(CblasRowMajor, &transA, &transB, &M, &N, &K, &alpha, + a_array.data(), &lda, b_array.data(), &ldb, &beta, + c_array.data(), &ldc, 1 /* group_count */, &batchCount); +} + +template <> +void batched_gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const double alpha, const double* A, const double* B, const double beta, + double* C, const int batchCount, const int strideA, const int strideB) { + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + auto a_array = std::vector(batchCount); + auto b_array = std::vector(batchCount); + auto c_array = std::vector(batchCount); + for (int k = 0; k < batchCount; ++k) { + a_array[k] = &A[k * strideA]; + b_array[k] = &B[k * strideB]; + c_array[k] = &C[k * M * N]; + } + cblas_dgemm_batch(CblasRowMajor, &transA, &transB, &M, &N, &K, &alpha, + a_array.data(), &lda, b_array.data(), &ldb, &beta, + c_array.data(), &ldc, 1 /* group_count */, &batchCount); +} +#else +// The below is a naive but correct serial implementation that just loops +// over the batch dimension. This is a fallback for when the batched gemm +// functions of Intel MKL are not available. In the future, this computation +// should be parallelized. +template <> +void batched_gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C, const int batchCount, const int strideA, const int strideB) { + for (int k = 0; k < batchCount; ++k) { + const float* Ak = &A[k * strideA]; + const float* Bk = &B[k * strideB]; + float* Ck = &C[k * M * N]; + gemm(context, transA, transB, M, N, K, + alpha, Ak, Bk, beta, Ck); + } +} + +template <> +void batched_gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const double alpha, const double* A, const double* B, const double beta, + double* C, const int batchCount, const int strideA, const int strideB) { + for (int k = 0; k < batchCount; ++k) { + const double* Ak = &A[k * strideA]; + const double* Bk = &B[k * strideB]; + double* Ck = &C[k * M * N]; + gemm(context, transA, transB, M, N, K, + alpha, Ak, Bk, beta, Ck); + } +} +#endif + +template <> +void gemv( + const platform::CPUDeviceContext& context, const bool trans_a, const int M, + const int N, const float alpha, const float* A, const float* B, + const float beta, float* C) { + CBLAS_TRANSPOSE transA = (trans_a == false) ? 
CblasNoTrans : CblasTrans; + cblas_sgemv(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1); +} + +template <> +void gemv( + const platform::CPUDeviceContext& context, const bool trans_a, const int M, + const int N, const double alpha, const double* A, const double* B, + const double beta, double* C) { + CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; + cblas_dgemv(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1); +} + +template <> +void axpy( + const platform::CPUDeviceContext& context, const int n, const float alpha, + const float* x, float* y) { + cblas_saxpy(n, alpha, x, 1, y, 1); +} + +template <> +void axpy( + const platform::CPUDeviceContext& context, const int n, const double alpha, + const double* x, double* y) { + cblas_daxpy(n, alpha, x, 1, y, 1); +} + +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; + +#define DEFINE_CPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; + +DEFINE_CPU_TRANS(1); +DEFINE_CPU_TRANS(2); +DEFINE_CPU_TRANS(3); +DEFINE_CPU_TRANS(4); +DEFINE_CPU_TRANS(5); +DEFINE_CPU_TRANS(6); + +struct TensorSetConstantCPU { + TensorSetConstantCPU(framework::Tensor* tensor, float value) + : tensor_(tensor), value_(value) {} + template + void operator()() const { + auto cpu = platform::CPUPlace(); + auto* begin = tensor_->mutable_data(cpu); + std::fill(begin, begin + tensor_->numel(), static_cast(value_)); + } + framework::Tensor* tensor_; + float value_; +}; + +template <> +void set_constant_with_place( + const platform::DeviceContext& context, framework::Tensor* tensor, + float value) { + framework::VisitDataType(framework::ToDataType(tensor->type()), + TensorSetConstantCPU(tensor, value)); +} + +struct TensorSetConstantWithPlace : public boost::static_visitor { + TensorSetConstantWithPlace(const platform::DeviceContext& context, + framework::Tensor* tensor, float value) + : context_(context), tensor_(tensor), value_(value) {} + + template + void operator()(Place place) const { + set_constant_with_place(context_, tensor_, value_); + } + + const platform::DeviceContext& context_; + framework::Tensor* tensor_; + float value_; +}; + +void set_constant(const platform::DeviceContext& context, + framework::Tensor* tensor, float value) { + TensorSetConstantWithPlace func(context, tensor, value); +#ifdef PADDLE_WITH_CUDA + tensor->place().apply_visitor(func); +#else + func(platform::CPUPlace()); +#endif +} + +template +struct RowwiseAdd { + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& vector, framework::Tensor* output) { + auto in_dims = input.dims(); + auto size = input.numel() / in_dims[0]; + PADDLE_ENFORCE_EQ(vector.numel(), size); + PADDLE_ENFORCE_EQ(output->dims(), in_dims); + + auto in = framework::EigenMatrix::From(input); + auto vec = framework::EigenVector::Flatten(vector); + auto out = framework::EigenMatrix::From(*output); + + for (int64_t i = 0; i < in_dims[0]; ++i) { + out.chip(i, 0) = in.chip(i, 0) + vec; + } + } +}; + +template struct RowwiseAdd; +template struct RowwiseAdd; + +template struct ColwiseSum; +template struct ColwiseSum; + +template struct RowwiseSum; +template struct RowwiseSum; + +template struct RowwiseMean; +template struct RowwiseMean; + +} // namespace math +} // namespace operators +} // namespace paddle diff 
--git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu new file mode 100644 index 0000000000000000000000000000000000000000..5764da71c8491e27493774569bb663f6a6e835c3 --- /dev/null +++ b/paddle/fluid/operators/math/math_function.cu @@ -0,0 +1,355 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/math_function_impl.h" + +namespace paddle { +namespace operators { +namespace math { + +template <> +void gemm( + const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C) { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + + PADDLE_ENFORCE(platform::dynload::cublasSgemm( + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, + lda, &beta, C, N)); +} + +template <> +void gemm( + const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const double alpha, const double* A, const double* B, const double beta, + double* C) { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + PADDLE_ENFORCE(platform::dynload::cublasDgemm( + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, + lda, &beta, C, N)); +} + +template <> +void gemm( + const platform::CUDADeviceContext& context, const bool transA, + const bool transB, const int M, const int N, const int K, const float alpha, + const float* A, const int lda, const float* B, const int ldb, + const float beta, float* C, const int ldc) { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = transB == false ? 
CUBLAS_OP_N : CUBLAS_OP_T; + PADDLE_ENFORCE(platform::dynload::cublasSgemm( + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, + lda, &beta, C, ldc)); +} + +template <> +void gemm( + const platform::CUDADeviceContext& context, const bool transA, + const bool transB, const int M, const int N, const int K, + const double alpha, const double* A, const int lda, const double* B, + const int ldb, const double beta, double* C, const int ldc) { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = transB == false ? CUBLAS_OP_N : CUBLAS_OP_T; + PADDLE_ENFORCE(platform::dynload::cublasDgemm( + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, + lda, &beta, C, ldc)); +} + +template <> +void matmul( + const platform::CUDADeviceContext& context, + const framework::Tensor& matrix_a, bool trans_a, + const framework::Tensor& matrix_b, bool trans_b, float alpha, + framework::Tensor* matrix_out, float beta) { + auto dim_a = matrix_a.dims(); + auto dim_b = matrix_b.dims(); + auto dim_out = matrix_out->dims(); + PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, + "The input and output of matmul be matrix"); + + PADDLE_ENFORCE(platform::is_gpu_place(matrix_a.place()) && + platform::is_gpu_place(matrix_b.place()) && + platform::is_gpu_place(matrix_out->place()), + "Matrix must all be in CUDAPlace"); + + int M = dim_out[0]; + int N = dim_out[1]; + int K = (trans_a == false) ? dim_a[1] : dim_a[0]; + + CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; + CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans; + + gemm( + context, transA, transB, M, N, K, alpha, matrix_a.data(), + matrix_b.data(), beta, matrix_out->data()); +} + +template <> +void matmul( + const platform::CUDADeviceContext& context, + const framework::Tensor& matrix_a, bool trans_a, + const framework::Tensor& matrix_b, bool trans_b, double alpha, + framework::Tensor* matrix_out, double beta) { + auto dim_a = matrix_a.dims(); + auto dim_b = matrix_b.dims(); + auto dim_out = matrix_out->dims(); + PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, + "The input and output of matmul be matrix"); + + PADDLE_ENFORCE(platform::is_gpu_place(matrix_a.place()) && + platform::is_gpu_place(matrix_b.place()) && + platform::is_gpu_place(matrix_out->place()), + "Matrix must all be in CUDAPlace"); + + int M = dim_out[0]; + int N = dim_out[1]; + int K = (trans_a == false) ? dim_a[1] : dim_a[0]; + + CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; + CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans; + + gemm( + context, transA, transB, M, N, K, alpha, matrix_a.data(), + matrix_b.data(), beta, matrix_out->data()); +} + +template <> +void batched_gemm( + const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C, const int batchCount, const int strideA, const int strideB) { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + const int strideC = M * N; + + PADDLE_ENFORCE(platform::dynload::cublasSgemmStridedBatched( + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, + strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount)); +} + +template <> +void batched_gemm( + const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const double alpha, const double* A, const double* B, const double beta, + double* C, const int batchCount, const int strideA, const int strideB) { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + const int strideC = M * N; + + PADDLE_ENFORCE(platform::dynload::cublasDgemmStridedBatched( + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, + strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount)); +} + +template <> +void gemv( + const platform::CUDADeviceContext& context, const bool trans_a, const int M, + const int N, const float alpha, const float* A, const float* B, + const float beta, float* C) { + cublasOperation_t cuTransA = (trans_a == false) ? CUBLAS_OP_T : CUBLAS_OP_N; + + PADDLE_ENFORCE(platform::dynload::cublasSgemv(context.cublas_handle(), + cuTransA, N, M, &alpha, A, N, B, + 1, &beta, C, 1)); +} + +template <> +void gemv( + const platform::CUDADeviceContext& context, const bool trans_a, const int M, + const int N, const double alpha, const double* A, const double* B, + const double beta, double* C) { + cublasOperation_t cuTransA = (trans_a == false) ? 
CUBLAS_OP_T : CUBLAS_OP_N; + PADDLE_ENFORCE(platform::dynload::cublasDgemv(context.cublas_handle(), + cuTransA, N, M, &alpha, A, N, B, + 1, &beta, C, 1)); +} + +template <> +void axpy( + const platform::CUDADeviceContext& context, const int n, const float alpha, + const float* x, float* y) { + PADDLE_ENFORCE(platform::dynload::cublasSaxpy(context.cublas_handle(), n, + &alpha, x, 1, y, 1)); +} + +template <> +void axpy( + const platform::CUDADeviceContext& context, const int n, const double alpha, + const double* x, double* y) { + PADDLE_ENFORCE(platform::dynload::cublasDaxpy(context.cublas_handle(), n, + &alpha, x, 1, y, 1)); +} + +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; + +#define DEFINE_GPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; + +DEFINE_GPU_TRANS(1); +DEFINE_GPU_TRANS(2); +DEFINE_GPU_TRANS(3); +DEFINE_GPU_TRANS(4); +DEFINE_GPU_TRANS(5); +DEFINE_GPU_TRANS(6); + +struct TensorSetConstantGPU { + TensorSetConstantGPU(const platform::DeviceContext& context, + framework::Tensor* tensor, float value) + : context_(context), tensor_(tensor), value_(value) {} + + template + void operator()() const { + SetConstant functor; + functor(reinterpret_cast(context_), + tensor_, static_cast(value_)); + } + + const platform::DeviceContext& context_; + framework::Tensor* tensor_; + float value_; +}; + +template <> +void set_constant_with_place( + const platform::DeviceContext& context, framework::Tensor* tensor, + float value) { + framework::VisitDataType(framework::ToDataType(tensor->type()), + TensorSetConstantGPU(context, tensor, value)); +} + +template +__global__ void RowwiseAddKernel(const T* a, const T* b, T* c, int width, + int num) { + T tmp = 1.0 / width; + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; + i += blockDim.x * gridDim.x) { + int h = i * tmp; + int w = i - h * width; + c[i] = a[i] + b[w]; + } +} + +template +struct RowwiseAdd { + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& vector, framework::Tensor* output) { + auto in_dims = input.dims(); + auto size = input.numel() / in_dims[0]; + PADDLE_ENFORCE_EQ(vector.numel(), size); + PADDLE_ENFORCE_EQ(output->dims(), in_dims); + int blocks = 512; + int grids = (input.numel() + blocks - 1) / blocks; + RowwiseAddKernel<<>>( + input.data(), vector.data(), output->data(), + static_cast(in_dims[1]), static_cast(input.numel())); + } +}; + +template struct RowwiseAdd; +template struct RowwiseAdd; +template struct ColwiseSum; +// template struct ColwiseSum; +// The ColwiseSum failed in debug mode, +// and only failed for this case. So reimplemented it. +template <> +void ColwiseSum::operator()( + const platform::CUDADeviceContext& context, const framework::Tensor& input, + framework::Tensor* vector) { + auto in_dims = input.dims(); + auto size = input.numel() / in_dims[0]; + PADDLE_ENFORCE_EQ(vector->numel(), size); + framework::Tensor one; + one.mutable_data({in_dims[0]}, context.GetPlace()); + SetConstant set; + set(context, &one, static_cast(1.0)); + gemv( + context, true, static_cast(in_dims[0]), static_cast(in_dims[1]), + 1.0, input.data(), one.data(), 0.0, + vector->data()); +} + +template struct RowwiseSum; +// template struct RowwiseSum; +// TODO(zcd): Following ColwiseSum format, need to confirm. +// The RowwiseSum failed in debug mode, +// and only failed for this case. So reimplemented it. 
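// Note, mirroring the ColwiseSum workaround above: both reductions are
// rewritten as a matrix-vector product against a constant-one vector,
// conceptually col_sum(X) = X^T * 1 and row_sum(X) = X * 1, so they go
// through cuBLAS gemv instead of Eigen's general reduce, which is the path
// that failed in debug mode for these cases.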
+template <> +void RowwiseSum::operator()( + const platform::CUDADeviceContext& context, const framework::Tensor& input, + framework::Tensor* vector) { + auto in_dims = input.dims(); + auto size = input.numel() / in_dims[0]; + PADDLE_ENFORCE_EQ(vector->numel(), in_dims[0]); + framework::Tensor one; + one.mutable_data({size}, context.GetPlace()); + SetConstant set; + set(context, &one, static_cast(1.0)); + gemv( + context, true, static_cast(in_dims[1]), static_cast(in_dims[0]), + 1.0, one.data(), input.data(), 0.0, + vector->data()); +} + +template struct RowwiseMean; +template struct RowwiseMean; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/math_function.h b/paddle/fluid/operators/math/math_function.h new file mode 100644 index 0000000000000000000000000000000000000000..84916af1f8e5bd55920685b4897858f2da9fde92 --- /dev/null +++ b/paddle/fluid/operators/math/math_function.h @@ -0,0 +1,145 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#ifdef PADDLE_WITH_MKLML +#include +#include +#include +#endif + +#ifdef PADDLE_USE_ATLAS +extern "C" { +#include +#include +} +#endif + +#ifdef PADDLE_USE_OPENBLAS +#include +#include +#endif + +#ifndef LAPACK_FOUND +extern "C" { +#include +int LAPACKE_sgetrf(int matrix_layout, int m, int n, float* a, int lda, + int* ipiv); +int LAPACKE_dgetrf(int matrix_layout, int m, int n, double* a, int lda, + int* ipiv); +int LAPACKE_sgetri(int matrix_layout, int n, float* a, int lda, + const int* ipiv); +int LAPACKE_dgetri(int matrix_layout, int n, double* a, int lda, + const int* ipiv); +} +#endif + +#include + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { +namespace math { + +// Support continuous memory now +// If transA = N, and transB = N +// Then matrixA: M * K, matrixB: K * N, matrixC : M * N +// For more detailed info, please refer to +// http://www.netlib.org/lapack/explore-html/d4/de2/sgemm_8f.html +template +void gemm(const DeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const T alpha, const T* A, const T* B, const T beta, T* C); + +// gemm wrapper with stride args for matrix uncontinuous in memory +template +void gemm(const DeviceContext& context, const bool transA, const bool transB, + const int M, const int N, const int K, const T alpha, const T* A, + const int lda, const T* B, const int ldb, const T beta, T* C, + const int ldc); + +// matrix multiply with continuous memory +template +void matmul(const DeviceContext& context, const framework::Tensor& matrix_a, + bool trans_a, const framework::Tensor& matrix_b, bool trans_b, + T alpha, framework::Tensor* matrix_out, T beta); + +// Batched gemm +template +void 
batched_gemm(const DeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, + const int K, const T alpha, const T* A, const T* B, + const T beta, T* C, const int batchCount, const int strideA, + const int strideB); + +template +void gemv(const DeviceContext& context, const bool trans_a, const int M, + const int N, const T alpha, const T* A, const T* B, const T beta, + T* C); + +template +void axpy(const DeviceContext& context, const int n, const T alpha, const T* x, + T* y); + +template +struct Transpose { + void operator()(const DeviceContext& context, const framework::Tensor& in, + framework::Tensor* out, const std::vector& axis); +}; + +template +struct SetConstant { + void operator()(const DeviceContext& context, framework::Tensor* tensor, + T num); +}; + +template +void set_constant_with_place(const platform::DeviceContext& context, + framework::Tensor* tensor, float value); + +void set_constant(const platform::DeviceContext& context, + framework::Tensor* tensor, float value); + +template +struct RowwiseAdd { + void operator()(const DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& vec, framework::Tensor* output); +}; + +template +struct ColwiseSum { + void operator()(const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* vec); +}; + +template +struct RowwiseSum { + void operator()(const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* vec); +}; + +template +struct RowwiseMean { + void operator()(const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* vec); +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/math_function_impl.h b/paddle/fluid/operators/math/math_function_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..a55ed6c58bafafed1d58ff47fb433a9ae58b8261 --- /dev/null +++ b/paddle/fluid/operators/math/math_function_impl.h @@ -0,0 +1,174 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { +namespace math { + +template +void SetConstant::operator()(const DeviceContext& context, + framework::Tensor* tensor, + T num) { + auto t = framework::EigenVector::Flatten(*tensor); + t.device(*context.eigen_device()) = t.constant(static_cast(num)); +} + +template +void Transpose::operator()( + const DeviceContext& context, const framework::Tensor& in, + framework::Tensor* out, const std::vector& axis) { + Eigen::array permute; + for (int i = 0; i < Rank; i++) { + permute[i] = axis[i]; + } + auto in_dim = in.dims(); + auto out_dim = out->dims(); + + auto eigen_in = framework::EigenTensor::From(in); + auto eigen_out = framework::EigenTensor::From(*out); + auto* dev = context.eigen_device(); + eigen_out.device(*dev) = eigen_in.shuffle(permute); +} + +template +void ColwiseSum::operator()(const DeviceContext& context, + const framework::Tensor& input, + framework::Tensor* out) { + auto in_dims = input.dims(); + auto size = input.numel() / in_dims[0]; + PADDLE_ENFORCE_EQ(out->numel(), size); + + auto in = framework::EigenMatrix::From(input); + auto vec = framework::EigenVector::Flatten(*out); + + vec.device(*context.eigen_device()) = in.sum(Eigen::array({{0}})); +} + +// Specialize for CPU, since Eigen implement a general reduce. However, +// colwise-sum can be easily implemented. General reduce has a huge overhead in +// CPU +template +class ColwiseSum { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, framework::Tensor* out) { + auto& in_dims = input.dims(); + auto height = in_dims[0]; + auto size = in_dims[1]; + PADDLE_ENFORCE_EQ(out->numel(), size); + + T* out_buf = out->mutable_data(out->place()); + const T* in_buf = input.data(); + + for (size_t i = 0; i < static_cast(height); ++i) { + for (size_t j = 0; j < static_cast(size); ++j) { + if (i == 0) { + out_buf[j] = in_buf[i * size + j]; + } else { + out_buf[j] += in_buf[i * size + j]; + } + } + } + } +}; + +template +void RowwiseMean::operator()(const DeviceContext& context, + const framework::Tensor& input, + framework::Tensor* out) { + auto in_dims = input.dims(); + PADDLE_ENFORCE_EQ(in_dims.size(), 2U); + PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]); + + auto in = framework::EigenMatrix::From(input); + auto vec = framework::EigenVector::Flatten(*out); + + vec.device(*context.eigen_device()) = in.mean(Eigen::array({{1}})); +} +// TODO(zcd): Following ColwiseSum format, need to confirm. +// Specialize for CPU, since Eigen implement a general reduce. However, +// rowwise-sum can be easily implemented. 
General reduce has a huge overhead in +// CPU +template +class RowwiseMean { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, framework::Tensor* out) { + auto& in_dims = input.dims(); + PADDLE_ENFORCE_EQ(in_dims.size(), 2U); + auto height = in_dims[0]; + auto size = in_dims[1]; + PADDLE_ENFORCE_EQ(out->numel(), height); + auto inv_size = 1.0 / size; + T* out_buf = out->mutable_data(out->place()); + const T* in_buf = input.data(); + + for (size_t i = 0; i < static_cast(height); ++i) { + T sum = 0; + for (size_t j = 0; j < static_cast(size); ++j) { + sum += in_buf[i * size + j]; + } + out_buf[i] = sum * inv_size; + } + } +}; + +template +void RowwiseSum::operator()(const DeviceContext& context, + const framework::Tensor& input, + framework::Tensor* out) { + auto in_dims = input.dims(); + PADDLE_ENFORCE_EQ(in_dims.size(), 2U); + PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]); + + auto in = framework::EigenMatrix::From(input); + auto vec = framework::EigenVector::Flatten(*out); + + vec.device(*context.eigen_device()) = in.sum(Eigen::array({{1}})); +} +// TODO(zcd): Following ColwiseSum format, need to confirm. +// Specialize for CPU, since Eigen implement a general reduce. However, +// rowwise-sum can be easily implemented. General reduce has a huge overhead in +// CPU +template +class RowwiseSum { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, framework::Tensor* out) { + auto& in_dims = input.dims(); + PADDLE_ENFORCE_EQ(in_dims.size(), 2U); + auto height = in_dims[0]; + auto size = in_dims[1]; + PADDLE_ENFORCE_EQ(out->numel(), size); + + T* out_buf = out->mutable_data(out->place()); + const T* in_buf = input.data(); + + for (size_t i = 0; i < static_cast(height); ++i) { + T sum = 0; + for (size_t j = 0; j < static_cast(size); ++j) { + sum += in_buf[i * size + j]; + } + out_buf[i] = sum; + } + } +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/math_function_test.cc b/paddle/fluid/operators/math/math_function_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..6cd8e8b35abeb6d321bb075d7b88f0abd8f0fc59 --- /dev/null +++ b/paddle/fluid/operators/math/math_function_test.cc @@ -0,0 +1,167 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include "paddle/fluid/operators/math/math_function.h" +#include "gtest/gtest.h" + +TEST(math_function, gemm_notrans_cblas) { + paddle::framework::Tensor input1; + paddle::framework::Tensor input2; + paddle::framework::Tensor input3; + + int m = 2; + int n = 3; + int k = 3; + auto* cpu_place = new paddle::platform::CPUPlace(); + float* input1_ptr = input1.mutable_data({2, 3}, *cpu_place); + float arr1[6] = {0, 1, 2, 3, 4, 5}; + memcpy(input1_ptr, arr1, 6 * sizeof(float)); + float* input2_ptr = input2.mutable_data({3, 4}, *cpu_place); + float arr2[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + memcpy(input2_ptr, arr2, 12 * sizeof(float)); + float* input3_ptr = input3.mutable_data({2, 4}, *cpu_place); + float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7}; + memcpy(input3_ptr, arr3, 8 * sizeof(float)); + + paddle::platform::CPUDeviceContext context(*cpu_place); + paddle::operators::math::gemm( + context, false, false, m, n, k, 1, input1_ptr, 3, input2_ptr + 1, 4, 1, + input3_ptr + 1, 4); + + EXPECT_EQ(input3_ptr[0], 0); + EXPECT_EQ(input3_ptr[1], 24); + EXPECT_EQ(input3_ptr[2], 28); + EXPECT_EQ(input3_ptr[3], 32); + EXPECT_EQ(input3_ptr[4], 4); + EXPECT_EQ(input3_ptr[5], 73); + EXPECT_EQ(input3_ptr[6], 86); + EXPECT_EQ(input3_ptr[7], 99); +} + +TEST(math_function, gemm_trans_clbas) { + paddle::framework::Tensor input1; + paddle::framework::Tensor input2; + paddle::framework::Tensor input3; + + int m = 2; + int n = 3; + int k = 3; + auto* cpu_place = new paddle::platform::CPUPlace(); + float* input1_ptr = input1.mutable_data({2, 3}, *cpu_place); + float arr1[6] = {0, 1, 2, 3, 4, 5}; + memcpy(input1_ptr, arr1, 6 * sizeof(float)); + float* input2_ptr = input2.mutable_data({4, 3}, *cpu_place); + float arr2[12] = {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11}; + memcpy(input2_ptr, arr2, 12 * sizeof(float)); + float* input3_ptr = input3.mutable_data({2, 4}, *cpu_place); + float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7}; + memcpy(input3_ptr, arr3, 8 * sizeof(float)); + + paddle::platform::CPUDeviceContext context(*cpu_place); + paddle::operators::math::gemm( + context, false, true, m, n, k, 1, input1_ptr, 3, input2_ptr + 3, 3, 1, + input3_ptr + 1, 4); + + EXPECT_EQ(input3_ptr[0], 0); + EXPECT_EQ(input3_ptr[1], 24); + EXPECT_EQ(input3_ptr[2], 28); + EXPECT_EQ(input3_ptr[3], 32); + EXPECT_EQ(input3_ptr[4], 4); + EXPECT_EQ(input3_ptr[5], 73); + EXPECT_EQ(input3_ptr[6], 86); + EXPECT_EQ(input3_ptr[7], 99); +} + +TEST(math_function, zero) { + paddle::framework::Tensor tensor; + auto* cpu_place = new paddle::platform::CPUPlace(); + float* t = tensor.mutable_data({2, 2}, *cpu_place); + paddle::platform::CPUDeviceContext context(*cpu_place); + paddle::operators::math::SetConstant + functor; + functor(context, &tensor, 0); + EXPECT_EQ(t[0], 0); + EXPECT_EQ(t[1], 0); + EXPECT_EQ(t[2], 0); + EXPECT_EQ(t[3], 0); + + functor(context, &tensor, 1); + + EXPECT_EQ(t[0], 1); + EXPECT_EQ(t[1], 1); + EXPECT_EQ(t[2], 1); + EXPECT_EQ(t[3], 1); +} + +template +void GemvTest(int m, int n, bool trans) { + paddle::framework::Tensor mat_a; + paddle::framework::Tensor vec_b; + paddle::framework::Tensor vec_c; + auto* cpu_place = new paddle::platform::CPUPlace(); + int b_num = trans ? m : n; + int c_num = trans ? 
n : m; + + T* data_a = mat_a.mutable_data({m, n}, *cpu_place); + T* data_b = vec_b.mutable_data({b_num}, *cpu_place); + T* data_c = vec_c.mutable_data({c_num}, *cpu_place); + for (int i = 0; i < mat_a.numel(); ++i) { + data_a[i] = static_cast(i); + } + for (int i = 0; i < vec_b.numel(); ++i) { + data_b[i] = static_cast(i); + } + + paddle::platform::CPUDeviceContext context(*cpu_place); + paddle::operators::math::gemv( + context, trans, static_cast(m), static_cast(n), 1., data_a, + data_b, 0., data_c); + + if (!trans) { + for (int i = 0; i < m; ++i) { + T sum = 0.0; + for (int j = 0; j < n; ++j) { + sum += data_a[i * n + j] * data_b[j]; + } + ASSERT_FLOAT_EQ(data_c[i], sum); + } + } else { + for (int i = 0; i < n; ++i) { + T sum = 0.0; + for (int j = 0; j < m; ++j) { + sum += data_a[j * n + i] * data_b[j]; + } + ASSERT_FLOAT_EQ(data_c[i], sum); + } + } +} + +TEST(math_function, gemv) { + GemvTest(3, 13, false); + GemvTest(4, 5, false); + GemvTest(12, 7, true); + GemvTest(7, 9, true); +} + +TEST(math_funciton, set_constant) { + paddle::framework::Tensor t; + t.Resize({10, 10}); + t.mutable_data(paddle::platform::CPUPlace()); + auto* ctx = new paddle::platform::CPUDeviceContext(); + paddle::operators::math::set_constant(*ctx, &t, 10); + for (int64_t i = 0; i < t.numel(); ++i) { + PADDLE_ENFORCE_EQ(10, t.data()[i]); + } + delete ctx; +} diff --git a/paddle/fluid/operators/math/math_function_test.cu b/paddle/fluid/operators/math/math_function_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..2ef53a8209940e52390cfe52978f3b32b6dabae6 --- /dev/null +++ b/paddle/fluid/operators/math/math_function_test.cu @@ -0,0 +1,255 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
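// A note on the cuBLAS-backed wrappers exercised here: cuBLAS works on
// column-major (Fortran-order) matrices, while these tensors are row-major.
// The implementations therefore swap the operand order and transpose flags,
// using the identity C = A * B  <=>  C^T = B^T * A^T, so the column-major
// result cuBLAS produces is exactly the row-major C the tests read back.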
+#include "gtest/gtest.h" +#include "paddle/fluid/operators/math/math_function.h" + +TEST(math_function, notrans_mul_trans) { + paddle::framework::Tensor input1; + paddle::framework::Tensor input1_gpu; + paddle::framework::Tensor input2_gpu; + paddle::framework::Tensor out_gpu; + paddle::framework::Tensor out; + + auto* cpu_place = new paddle::platform::CPUPlace(); + float* input1_ptr = input1.mutable_data({2, 3}, *cpu_place); + float arr[6] = {0, 1, 2, 3, 4, 5}; + memcpy(input1_ptr, arr, 6 * sizeof(float)); + + auto* gpu_place = new paddle::platform::CUDAPlace(0); + paddle::platform::CUDADeviceContext context(*gpu_place); + + paddle::framework::Copy(input1, *gpu_place, context, &input1_gpu); + paddle::framework::Copy(input1, *gpu_place, context, &input2_gpu); + + out_gpu.mutable_data({2, 2}, *gpu_place); + + paddle::operators::math::matmul( + context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0); + + paddle::framework::Copy(out_gpu, *cpu_place, context, &out); + + float* out_ptr = out.data(); + context.Wait(); + EXPECT_EQ(out_ptr[0], 5); + EXPECT_EQ(out_ptr[1], 14); + EXPECT_EQ(out_ptr[2], 14); + EXPECT_EQ(out_ptr[3], 50); + delete gpu_place; +} + +TEST(math_function, trans_mul_notrans) { + paddle::framework::Tensor input1; + paddle::framework::Tensor input1_gpu; + paddle::framework::Tensor input2_gpu; + paddle::framework::Tensor out_gpu; + paddle::framework::Tensor out; + + auto* cpu_place = new paddle::platform::CPUPlace(); + float* input1_ptr = input1.mutable_data({2, 3}, *cpu_place); + float arr[6] = {0, 1, 2, 3, 4, 5}; + memcpy(input1_ptr, arr, 6 * sizeof(float)); + + auto* gpu_place = new paddle::platform::CUDAPlace(0); + paddle::platform::CUDADeviceContext context(*gpu_place); + + paddle::framework::Copy(input1, *gpu_place, context, &input1_gpu); + paddle::framework::Copy(input1, *gpu_place, context, &input2_gpu); + + out_gpu.mutable_data({3, 3}, *gpu_place); + + paddle::operators::math::matmul( + context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0); + + paddle::framework::Copy(out_gpu, *cpu_place, context, &out); + + float* out_ptr = out.data(); + context.Wait(); + EXPECT_EQ(out_ptr[0], 9); + EXPECT_EQ(out_ptr[1], 12); + EXPECT_EQ(out_ptr[2], 15); + EXPECT_EQ(out_ptr[3], 12); + EXPECT_EQ(out_ptr[4], 17); + EXPECT_EQ(out_ptr[5], 22); + EXPECT_EQ(out_ptr[6], 15); + EXPECT_EQ(out_ptr[7], 22); + EXPECT_EQ(out_ptr[8], 29); + delete gpu_place; +} + +TEST(math_function, gemm_notrans_cublas) { + paddle::framework::Tensor input1; + paddle::framework::Tensor input2; + paddle::framework::Tensor input3; + paddle::framework::Tensor input1_gpu; + paddle::framework::Tensor input2_gpu; + paddle::framework::Tensor input3_gpu; + + int m = 2; + int n = 3; + int k = 3; + auto* cpu_place = new paddle::platform::CPUPlace(); + float* input1_ptr = input1.mutable_data({2, 3}, *cpu_place); + float arr1[6] = {0, 1, 2, 3, 4, 5}; + memcpy(input1_ptr, arr1, 6 * sizeof(float)); + float* input2_ptr = input2.mutable_data({3, 4}, *cpu_place); + float arr2[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + memcpy(input2_ptr, arr2, 12 * sizeof(float)); + float* input3_ptr = input3.mutable_data({2, 4}, *cpu_place); + float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7}; + memcpy(input3_ptr, arr3, 8 * sizeof(float)); + + auto* gpu_place = new paddle::platform::CUDAPlace(0); + paddle::platform::CUDADeviceContext context(*gpu_place); + + paddle::framework::Copy(input1, *gpu_place, context, &input1_gpu); + paddle::framework::Copy(input2, *gpu_place, context, &input2_gpu); + paddle::framework::Copy(input3, *gpu_place, 
context, &input3_gpu); + float* a = input1_gpu.data(); + float* b = input2_gpu.data(); + float* c = input3_gpu.mutable_data(*gpu_place); + + paddle::operators::math::gemm( + context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4); + + paddle::framework::Copy(input3_gpu, *cpu_place, context, &input3); + + // numpy code: + // a = np.arange(6).reshape(2, 3) + // b = np.arange(12).reshape(3, 4)[:, 1:] + // c = np.arange(8).reshape(2, 4)[:, 1:] + // out = np.arange(8).reshape(2, 4) + // out[:, 1:] = np.dot(a, b) + c + context.Wait(); + EXPECT_EQ(input3_ptr[0], 0); + EXPECT_EQ(input3_ptr[1], 24); + EXPECT_EQ(input3_ptr[2], 28); + EXPECT_EQ(input3_ptr[3], 32); + EXPECT_EQ(input3_ptr[4], 4); + EXPECT_EQ(input3_ptr[5], 73); + EXPECT_EQ(input3_ptr[6], 86); + EXPECT_EQ(input3_ptr[7], 99); + delete gpu_place; +} + +TEST(math_function, gemm_trans_cublas) { + paddle::framework::Tensor input1; + paddle::framework::Tensor input2; + paddle::framework::Tensor input3; + paddle::framework::Tensor input1_gpu; + paddle::framework::Tensor input2_gpu; + paddle::framework::Tensor input3_gpu; + + int m = 2; + int n = 3; + int k = 3; + auto* cpu_place = new paddle::platform::CPUPlace(); + float* input1_ptr = input1.mutable_data({2, 3}, *cpu_place); + float arr1[6] = {0, 1, 2, 3, 4, 5}; + memcpy(input1_ptr, arr1, 6 * sizeof(float)); + float* input2_ptr = input2.mutable_data({4, 3}, *cpu_place); + float arr2[12] = {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11}; + memcpy(input2_ptr, arr2, 12 * sizeof(float)); + float* input3_ptr = input3.mutable_data({2, 4}, *cpu_place); + float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7}; + memcpy(input3_ptr, arr3, 8 * sizeof(float)); + + auto* gpu_place = new paddle::platform::CUDAPlace(0); + paddle::platform::CUDADeviceContext context(*gpu_place); + + paddle::framework::Copy(input1, *gpu_place, context, &input1_gpu); + paddle::framework::Copy(input2, *gpu_place, context, &input2_gpu); + paddle::framework::Copy(input3, *gpu_place, context, &input3_gpu); + float* a = input1_gpu.data(); + float* b = input2_gpu.data(); + float* c = input3_gpu.mutable_data(*gpu_place); + + paddle::operators::math::gemm( + context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4); + + paddle::framework::Copy(input3_gpu, *cpu_place, context, &input3); + context.Wait(); + + EXPECT_EQ(input3_ptr[0], 0); + EXPECT_EQ(input3_ptr[1], 24); + EXPECT_EQ(input3_ptr[2], 28); + EXPECT_EQ(input3_ptr[3], 32); + EXPECT_EQ(input3_ptr[4], 4); + EXPECT_EQ(input3_ptr[5], 73); + EXPECT_EQ(input3_ptr[6], 86); + EXPECT_EQ(input3_ptr[7], 99); + delete gpu_place; +} + +template +void GemvTest(int m, int n, bool trans) { + paddle::framework::Tensor mat_a; + paddle::framework::Tensor vec_b; + paddle::framework::Tensor vec_c; + auto* cpu_place = new paddle::platform::CPUPlace(); + + T* data_a = mat_a.mutable_data({m, n}, *cpu_place); + T* data_b = vec_b.mutable_data({trans ? m : n}, *cpu_place); + T* data_c = vec_c.mutable_data({trans ? 
n : m}, *cpu_place); + + auto* gpu_place = new paddle::platform::CUDAPlace(0); + paddle::framework::Tensor g_mat_a; + paddle::framework::Tensor g_vec_b; + paddle::framework::Tensor g_vec_c; + T* g_data_a = g_mat_a.mutable_data(mat_a.dims(), *gpu_place); + T* g_data_b = g_vec_b.mutable_data(vec_b.dims(), *gpu_place); + T* g_data_c = g_vec_c.mutable_data(vec_c.dims(), *gpu_place); + + for (int i = 0; i < mat_a.numel(); ++i) { + data_a[i] = static_cast(i); + } + for (int i = 0; i < vec_b.numel(); ++i) { + data_b[i] = static_cast(i); + } + + paddle::platform::CUDADeviceContext context(*gpu_place); + paddle::framework::Copy(mat_a, *gpu_place, context, &g_mat_a); + paddle::framework::Copy(vec_b, *gpu_place, context, &g_vec_b); + + paddle::operators::math::gemv( + context, trans, static_cast(m), static_cast(n), 1., g_data_a, + g_data_b, 0., g_data_c); + + paddle::framework::Copy(g_vec_c, paddle::platform::CPUPlace(), context, + &vec_c); + + if (!trans) { + for (int i = 0; i < m; ++i) { + T sum = 0.0; + for (int j = 0; j < n; ++j) { + sum += data_a[i * n + j] * data_b[j]; + } + ASSERT_FLOAT_EQ(data_c[i], sum); + } + } else { + for (int i = 0; i < n; ++i) { + T sum = 0.0; + for (int j = 0; j < m; ++j) { + sum += data_a[j * n + i] * data_b[j]; + } + ASSERT_FLOAT_EQ(data_c[i], sum); + } + } +} + +TEST(math_function, gemv) { + GemvTest(3, 13, false); + GemvTest(3, 13, false); + GemvTest(3, 13, true); + GemvTest(3, 13, true); +} diff --git a/paddle/fluid/operators/math/matmul.h b/paddle/fluid/operators/math/matmul.h new file mode 100644 index 0000000000000000000000000000000000000000..50f79979d99141c8074c55d2b767285dced49f60 --- /dev/null +++ b/paddle/fluid/operators/math/matmul.h @@ -0,0 +1,145 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { +namespace math { + +// Implements the logic of numpy matmul: +// https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.matmul.html +// +// but allowing also for a, b to be transposed +// +// Both a & b can be 1- to 3-dimensional. Higher rank tensors are not supported +// yet. 
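+// For example (no transposes unless stated):
+//   a: (2, 3),    b: (3, 4)    -> M = 2, K = 3, N = 4, one GEMM call
+//   a: (3,)                    -> treated as a 1 x 3 row vector (M = 1, K = 3)
+//   b: (3,)                    -> treated as a 3 x 1 column vector (K = 3, N = 1)
+//   a: (8, 2, 3), b: (8, 3, 4) -> batchCount = 8, strideA = 6, strideB = 12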
+template +class MatMulFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& a, + bool trans_a, const framework::Tensor& b, bool trans_b, + T alpha, framework::Tensor* out, T beta) { + auto dim_a = a.dims(); + auto dim_b = b.dims(); + + PADDLE_ENFORCE(a.place() == b.place() && b.place() == out->place(), + "Tensors must all be in the same place."); + PADDLE_ENFORCE_GE(dim_a.size(), 1, + "Input tensor a must be at least 1-dimensional."); + PADDLE_ENFORCE_GE(dim_b.size(), 1, + "Input tensor b must be at least 1-dimensional."); + + std::vector out_dim; + int64_t batch_count = 1; + if (dim_a.size() > 3) { + PADDLE_ENFORCE(dim_b.size() == dim_a.size(), + "The dimensions of X and Y must be the same, and both of " + "them should be %d-dimensional.", + dim_b.size()); + // The first rank-2 dimensions are accumulated on the batch_count, and the + // last two dimensions are used for matrix multiplication. + for (int j = 0; j < dim_a.size() - 2; ++j) { + PADDLE_ENFORCE_EQ(dim_b[j], dim_a[j], + "The %d-th dimension of X and Y must be the same.", + j); + out_dim.push_back(dim_a[j]); + batch_count *= dim_a[j]; + } + } + + int M = 0, N = 0, kA = 0, kB = 0, batchCountA = 0, batchCountB = 0, + strideA = 0, strideB = 0; + + switch (dim_a.size()) { + case 1: + // similar to np.matmul: + // prepend dimension 1 (no transpose) or append dimension 1 (transpose) + M = trans_a ? dim_a[0] : 1; + kA = trans_a ? 1 : dim_a[0]; + break; + case 2: + M = trans_a ? dim_a[1] : dim_a[0]; + kA = trans_a ? dim_a[0] : dim_a[1]; + break; + case 3: + batchCountA = dim_a[0]; + M = trans_a ? dim_a[2] : dim_a[1]; + kA = trans_a ? dim_a[1] : dim_a[2]; + strideA = M * kA; + break; + default: + batchCountA = batch_count; + size_t mat_s = dim_a.size() - 2; + M = trans_a ? dim_a[mat_s + 1] : dim_a[mat_s]; + kA = trans_a ? dim_a[mat_s] : dim_a[mat_s + 1]; + strideA = M * kA; + } + + switch (dim_b.size()) { + case 1: + // similar to np.matmul: + // append dimension 1 (no transpose) or prepend dimension 1 (transpose) + kB = trans_b ? 1 : dim_b[0]; + N = trans_b ? dim_b[0] : 1; + break; + case 2: + kB = trans_b ? dim_b[1] : dim_b[0]; + N = trans_b ? dim_b[0] : dim_b[1]; + break; + case 3: + batchCountB = dim_b[0]; + kB = trans_b ? dim_b[2] : dim_b[1]; + N = trans_b ? dim_b[1] : dim_b[2]; + strideB = kB * N; + break; + default: + batchCountB = batch_count; + size_t mat_s = dim_b.size() - 2; + kB = trans_b ? dim_b[mat_s + 1] : dim_b[mat_s]; + N = trans_b ? dim_b[mat_s] : dim_b[mat_s + 1]; + strideB = kB * N; + } + + PADDLE_ENFORCE_EQ( + kA, kB, + "First matrix's width must be equal with second matrix's height."); + if (batchCountA && batchCountB) { + PADDLE_ENFORCE_EQ( + batchCountA, batchCountB, + "When input tensors a and b are both batched, they must have the " + "same batch dimension."); + } + int batchCount = std::max(batchCountA, batchCountB); + + CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; + CBLAS_TRANSPOSE transB = (trans_b == false) ? 
CblasNoTrans : CblasTrans; + + if (!batchCount) { + // regular matrix multiplication + gemm(context, transA, transB, M, N, kA, alpha, + a.data(), b.data(), beta, out->data()); + } else { + // batched matrix multiplication + batched_gemm( + context, transA, transB, M, N, kA, alpha, a.data(), b.data(), + beta, out->data(), batchCount, strideA, strideB); + } + } +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/maxouting.cc b/paddle/fluid/operators/math/maxouting.cc new file mode 100644 index 0000000000000000000000000000000000000000..746328cd45ada637132ac661c87f4ac4710aeaa4 --- /dev/null +++ b/paddle/fluid/operators/math/maxouting.cc @@ -0,0 +1,101 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/maxouting.h" + +namespace paddle { +namespace operators { +namespace math { + +// All tensors are in NCHW format, and the groups must be greater than 1 +template +class MaxOutFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, framework::Tensor* output, + int groups) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output->dims()[1]; + int fea_size = input_height * input_width; + // c_size means the output size of each sample + int c_size = fea_size * output_channels; + const T* input_data = input.data(); + T* output_data = output->mutable_data(context.GetPlace()); + + for (int i = 0; i < batch_size; ++i) { + int new_bindex = c_size * i; + for (int c = 0; c < output_channels; ++c) { + int new_cindex = fea_size * c; + for (int f = 0; f < fea_size; ++f) { + T ele = static_cast(-FLT_MAX); + for (int ph = 0; ph < groups; ++ph) { + T x = input_data[(new_bindex + new_cindex) * groups + + ph * fea_size + f]; + ele = ele > x ? 
ele : x; + } + output_data[(new_bindex + new_cindex + f)] = ele; + } + } + } + } +}; + +template +class MaxOutGradFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, framework::Tensor* input_grad, + const framework::Tensor& output, + const framework::Tensor& output_grad, int groups) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output.dims()[1]; + int fea_size = input_height * input_width; + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + for (int i = 0; i < batch_size; ++i) { + int blen = fea_size * output_channels * i; + for (int c = 0; c < output_channels; ++c) { + int clen = fea_size * c; + for (int f = 0; f < fea_size; ++f) { + int input_idx0 = (blen + clen) * groups + f; + bool continue_match = true; + int output_idx = blen + clen + f; + for (int g = 0; g < groups && continue_match; ++g) { + int input_idx = input_idx0 + fea_size * g; + if (input_data[input_idx] == output_data[output_idx]) { + input_grad_data[input_idx] += output_grad_data[output_idx]; + continue_match = false; + } + } + } + } + } + } +}; + +template class MaxOutGradFunctor; +template class MaxOutGradFunctor; +template class MaxOutFunctor; +template class MaxOutFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/maxouting.cu b/paddle/fluid/operators/math/maxouting.cu new file mode 100644 index 0000000000000000000000000000000000000000..68e5dfc3c551958c1f201341b8a704d9306ef150 --- /dev/null +++ b/paddle/fluid/operators/math/maxouting.cu @@ -0,0 +1,147 @@ +/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/maxouting.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { +namespace math { + +template +__global__ void KernelMaxOut(const int nthreads, const T* input_data, + const int channels, const int input_height, + const int input_width, int groups, + T* output_data) { + const int size = input_height * input_width * channels / groups; + const int feat_len = input_height * input_width; + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (int i = index; i < nthreads; i += offset) { + int batch_idx = i / size; + int batch_offset = i % size; + int channel_idx = batch_offset / feat_len; + int feat_idx = batch_offset % feat_len; + int data_idx = + (batch_idx * size + channel_idx * feat_len) * groups + feat_idx; + T ele = static_cast(-FLT_MAX); + for (int g = 0; g < groups; ++g) { + T x = input_data[data_idx + g * feat_len]; + ele = ele > x ? 
ele : x; + } + output_data[i] = ele; + } +} +template +__global__ void KernelMaxoutGrad(const int nthreads, const T* input_data, + const T* output_data, const T* output_grad, + T* input_grad, const int channels, + const int input_height, const int input_width, + int groups) { + const int size = input_height * input_width * channels / groups; + const int feat_len = input_height * input_width; + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (int i = index; i < nthreads; i += offset) { + int batch_idx = i / size; + int batch_offset = i % size; + int channel_idx = batch_offset / feat_len; + int feat_idx = batch_offset % feat_len; + int data_idx = + (batch_idx * size + channel_idx * feat_len) * groups + feat_idx; + int max_index = -1; + bool continue_match = true; + for (int g = 0; g < groups && continue_match; ++g) { + if (input_data[data_idx + g * feat_len] == output_data[i]) { + max_index = data_idx + g * feat_len; + continue_match = false; + break; + } + } + if (max_index != -1) { + input_grad[max_index] += output_grad[index]; + } + } +} +/* + * All tensors are in NCHW format. + */ +template +class MaxOutFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, framework::Tensor* output, + int groups) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[1]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output->dims()[1]; + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + + const T* input_data = input.data(); + T* output_data = output->mutable_data(context.GetPlace()); + int nthreads = output->numel(); + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelMaxOut<<>>( + nthreads, input_data, input_channels, input_height, input_width, groups, + output_data); + } +}; +/* + * All tensors are in NCHW format. 
+ */ +template +class MaxOutGradFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, framework::Tensor* input_grad, + const framework::Tensor& output, + const framework::Tensor& output_grad, int groups) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[1]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output.dims()[1]; + const int output_height = output.dims()[2]; + const int output_width = output.dims()[3]; + + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + int nthreads = output.numel(); + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelMaxoutGrad<<>>( + nthreads, input_data, output_data, output_grad_data, input_grad_data, + input_channels, input_height, input_width, groups); + } +}; + +template class MaxOutGradFunctor; +template class MaxOutGradFunctor; + +template class MaxOutFunctor; +template class MaxOutFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/maxouting.h b/paddle/fluid/operators/math/maxouting.h new file mode 100644 index 0000000000000000000000000000000000000000..0e81790f0aba422f6676cd329def95a642a12239 --- /dev/null +++ b/paddle/fluid/operators/math/maxouting.h @@ -0,0 +1,43 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { +namespace math { + +#define FLT_MAX __FLT_MAX__ + +template +class MaxOutFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* output, int groups); +}; + +template +class MaxOutGradFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* input_grad, + const framework::Tensor& output, + const framework::Tensor& output_grad, int groups); +}; +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/pooling.cc b/paddle/fluid/operators/math/pooling.cc new file mode 100644 index 0000000000000000000000000000000000000000..9adb142f14ea984c598855cf30838f0a1f2e5015 --- /dev/null +++ b/paddle/fluid/operators/math/pooling.cc @@ -0,0 +1,760 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/pooling.h" + +namespace paddle { +namespace operators { +namespace math { + +/* + * All tensors are in NCHW format. + * Ksize, strides, paddings are two elements. These two elements represent + * height and width, respectively. + */ +template +class Pool2dFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, std::vector& ksize, + std::vector& strides, std::vector& paddings, + PoolProcess pool_process, framework::Tensor* output) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output->dims()[1]; + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + const int ksize_height = ksize[0]; + const int ksize_width = ksize[1]; + const int stride_height = strides[0]; + const int stride_width = strides[1]; + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + + const int input_stride = input_height * input_width; + const int output_stride = output_height * output_width; + + const T* input_data = input.data(); + T* output_data = output->mutable_data(context.GetPlace()); + + for (int i = 0; i < batch_size; i++) { + for (int c = 0; c < output_channels; ++c) { + for (int ph = 0; ph < output_height; ++ph) { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + for (int pw = 0; pw < output_width; ++pw) { + int wstart = pw * stride_width - padding_width; + int wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + + T ele = pool_process.initial(); + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + pool_process.compute(ele, input_data[h * input_width + w]); + } + } + int pool_size = (hend - hstart) * (wend - wstart); + pool_process.finalize(ele, (static_cast(pool_size))); + output_data[ph * output_width + pw] = ele; + } + } + input_data += input_stride; + output_data += output_stride; + } + } + } +}; + +/* +* All tensors are in NCHW format. +* Ksize, strides, paddings are two elements. These two elements represent height +* and width, respectively. 
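+* In the backward pass below, every input element inside a pooling window
+* receives a share of that window's output gradient; the share is scaled by
+* 1 / pool_size, where pool_size = (hend - hstart) * (wend - wstart), and the
+* exact rule is delegated to pool_grad_process.compute().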
+*/ +template +class Pool2dGradFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& output, + const framework::Tensor& output_grad, std::vector& ksize, + std::vector& strides, std::vector& paddings, + PoolProcess pool_grad_process, + framework::Tensor* input_grad) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output.dims()[1]; + const int output_height = output.dims()[2]; + const int output_width = output.dims()[3]; + const int ksize_height = ksize[0]; + const int ksize_width = ksize[1]; + const int stride_height = strides[0]; + const int stride_width = strides[1]; + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + const int input_stride = input_height * input_width; + const int output_stride = output_height * output_width; + + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + for (int i = 0; i < batch_size; i++) { + for (int c = 0; c < output_channels; ++c) { + for (int ph = 0; ph < output_height; ++ph) { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + for (int pw = 0; pw < output_width; ++pw) { + int wstart = pw * stride_width - padding_width; + int wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + int pool_size = (hend - hstart) * (wend - wstart); + float scale = 1.0 / pool_size; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + pool_grad_process.compute( + input_data[h * input_width + w], + output_data[ph * output_width + pw], + output_grad_data[ph * output_width + pw], + input_grad_data[h * input_width + w], + static_cast(scale)); + } + } + } + } + input_data += input_stride; + output_data += output_stride; + input_grad_data += input_stride; + output_grad_data += output_stride; + } + } + } +}; + +/* + * All tensors are in NCHW format. + * Ksize, strides, paddings are two elements. These two elements represent + * height and width, respectively. 
+ */ +template +class MaxPool2dGradFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& output, + const framework::Tensor& output_grad, std::vector& ksize, + std::vector& strides, std::vector& paddings, + framework::Tensor* input_grad) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output.dims()[1]; + const int output_height = output.dims()[2]; + const int output_width = output.dims()[3]; + const int ksize_height = ksize[0]; + const int ksize_width = ksize[1]; + const int stride_height = strides[0]; + const int stride_width = strides[1]; + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + const int input_stride = input_height * input_width; + const int output_stride = output_height * output_width; + + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + for (int i = 0; i < batch_size; i++) { + for (int c = 0; c < output_channels; ++c) { + for (int ph = 0; ph < output_height; ++ph) { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + for (int pw = 0; pw < output_width; ++pw) { + int wstart = pw * stride_width - padding_width; + int wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + + bool stop = false; + for (int h = hstart; h < hend && !stop; ++h) { + for (int w = wstart; w < wend && !stop; ++w) { + int input_idx = h * input_width + w; + int output_idx = ph * output_width + pw; + if (input_data[input_idx] == output_data[output_idx]) { + input_grad_data[input_idx] += output_grad_data[output_idx]; + stop = true; + } + } + } + } + } + input_data += input_stride; + output_data += output_stride; + input_grad_data += input_stride; + output_grad_data += output_stride; + } + } + } +}; + +template class MaxPool2dGradFunctor; +template class MaxPool2dGradFunctor; + +template class Pool2dFunctor, float>; +template class Pool2dFunctor, float>; +template class Pool2dGradFunctor, + float>; +template class Pool2dGradFunctor, + float>; +template class Pool2dFunctor, double>; +template class Pool2dFunctor, double>; +template class Pool2dGradFunctor, + double>; +template class Pool2dGradFunctor, + double>; + +/* + * All tensors are in NCDHW format. + * Ksize, strides, paddings are three elements. These three elements represent + * depth, height and width, respectively. 
+ */ +template +class Pool3dFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, std::vector& ksize, + std::vector& strides, std::vector& paddings, + PoolProcess pool_process, framework::Tensor* output) { + const int batch_size = input.dims()[0]; + const int input_depth = input.dims()[2]; + const int input_height = input.dims()[3]; + const int input_width = input.dims()[4]; + const int output_channels = output->dims()[1]; + const int output_depth = output->dims()[2]; + const int output_height = output->dims()[3]; + const int output_width = output->dims()[4]; + const int ksize_depth = ksize[0]; + const int ksize_height = ksize[1]; + const int ksize_width = ksize[2]; + const int stride_depth = strides[0]; + const int stride_height = strides[1]; + const int stride_width = strides[2]; + const int padding_depth = paddings[0]; + const int padding_height = paddings[1]; + const int padding_width = paddings[2]; + + const int input_stride = input_depth * input_height * input_width; + const int output_stride = output_depth * output_height * output_width; + + const T* input_data = input.data(); + T* output_data = output->mutable_data(context.GetPlace()); + + for (int i = 0; i < batch_size; i++) { + for (int c = 0; c < output_channels; ++c) { + for (int pd = 0; pd < output_depth; ++pd) { + int dstart = pd * stride_depth - padding_depth; + int dend = std::min(dstart + ksize_depth, input_depth); + dstart = std::max(dstart, 0); + for (int ph = 0; ph < output_height; ++ph) { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + for (int pw = 0; pw < output_width; ++pw) { + int wstart = pw * stride_width - padding_width; + int wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + int output_idx = (pd * output_height + ph) * output_width + pw; + T ele = pool_process.initial(); + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + pool_process.compute( + ele, + input_data[(d * input_height + h) * input_width + w]); + } + } + } + int pool_size = + (dend - dstart) * (hend - hstart) * (wend - wstart); + pool_process.finalize(ele, static_cast(pool_size)); + output_data[output_idx] = ele; + } + } + } + input_data += input_stride; + output_data += output_stride; + } + } + } +}; + +/* + * All tensors are in NCDHW format. + * Ksize, strides, paddings are three elements. These three elements represent + * depth, height and width, respectively. 
+ */ +template +class Pool3dGradFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& output, + const framework::Tensor& output_grad, std::vector& ksize, + std::vector& strides, std::vector& paddings, + PoolProcess pool_grad_process, + framework::Tensor* input_grad) { + const int batch_size = input.dims()[0]; + const int input_depth = input.dims()[2]; + const int input_height = input.dims()[3]; + const int input_width = input.dims()[4]; + const int output_channels = output.dims()[1]; + const int output_depth = output.dims()[2]; + const int output_height = output.dims()[3]; + const int output_width = output.dims()[4]; + const int ksize_depth = ksize[0]; + const int ksize_height = ksize[1]; + const int ksize_width = ksize[2]; + const int stride_depth = strides[0]; + const int stride_height = strides[1]; + const int stride_width = strides[2]; + const int padding_depth = paddings[0]; + const int padding_height = paddings[1]; + const int padding_width = paddings[2]; + const int input_stride = input_depth * input_height * input_width; + const int output_stride = output_depth * output_height * output_width; + + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + for (int i = 0; i < batch_size; i++) { + for (int c = 0; c < output_channels; ++c) { + for (int pd = 0; pd < output_depth; ++pd) { + int dstart = pd * stride_depth - padding_depth; + int dend = std::min(dstart + ksize_depth, input_depth); + dstart = std::max(dstart, 0); + for (int ph = 0; ph < output_height; ++ph) { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + + for (int pw = 0; pw < output_width; ++pw) { + int wstart = pw * stride_width - padding_width; + int wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + + int pool_size = + (dend - dstart) * (hend - hstart) * (wend - wstart); + float scale = 1.0 / pool_size; + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int input_idx = (d * input_height + h) * input_width + w; + int output_idx = + (pd * output_height + ph) * output_width + pw; + pool_grad_process.compute( + input_data[input_idx], output_data[output_idx], + output_grad_data[output_idx], + input_grad_data[input_idx], static_cast(scale)); + } + } + } + } + } + } + input_data += input_stride; + output_data += output_stride; + input_grad_data += input_stride; + output_grad_data += output_stride; + } + } + } +}; + +/* + * All tensors are in NCDHW format. + * Ksize, strides, paddings are three elements. These three elements represent + * depth, height and width, respectively. 
+ */ +template +class MaxPool3dGradFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& output, + const framework::Tensor& output_grad, std::vector& ksize, + std::vector& strides, std::vector& paddings, + framework::Tensor* input_grad) { + const int batch_size = input.dims()[0]; + const int input_depth = input.dims()[2]; + const int input_height = input.dims()[3]; + const int input_width = input.dims()[4]; + const int output_channels = output.dims()[1]; + const int output_depth = output.dims()[2]; + const int output_height = output.dims()[3]; + const int output_width = output.dims()[4]; + const int ksize_depth = ksize[0]; + const int ksize_height = ksize[1]; + const int ksize_width = ksize[2]; + const int stride_depth = strides[0]; + const int stride_height = strides[1]; + const int stride_width = strides[2]; + const int padding_depth = paddings[0]; + const int padding_height = paddings[1]; + const int padding_width = paddings[2]; + const int input_stride = input_depth * input_height * input_width; + const int output_stride = output_depth * output_height * output_width; + + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + for (int i = 0; i < batch_size; i++) { + for (int c = 0; c < output_channels; ++c) { + for (int pd = 0; pd < output_depth; ++pd) { + int dstart = pd * stride_depth - padding_depth; + int dend = std::min(dstart + ksize_depth, input_depth); + dstart = std::max(dstart, 0); + for (int ph = 0; ph < output_height; ++ph) { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + for (int pw = 0; pw < output_width; ++pw) { + int wstart = pw * stride_width - padding_width; + int wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + bool stop = false; + for (int d = dstart; d < dend && !stop; ++d) { + for (int h = hstart; h < hend && !stop; ++h) { + for (int w = wstart; w < wend && !stop; ++w) { + int input_idx = (d * input_height + h) * input_width + w; + int output_idx = + (pd * output_height + ph) * output_width + pw; + + if (input_data[input_idx] == output_data[output_idx]) { + input_grad_data[input_idx] += + output_grad_data[output_idx]; + stop = true; + } + } + } + } + } + } + } + input_data += input_stride; + output_data += output_stride; + input_grad_data += input_stride; + output_grad_data += output_stride; + } + } + } +}; + +template class MaxPool3dGradFunctor; +template class MaxPool3dGradFunctor; + +template class Pool3dFunctor, float>; +template class Pool3dFunctor, float>; +template class Pool3dGradFunctor, + float>; +template class Pool3dGradFunctor, + float>; +template class Pool3dFunctor, double>; +template class Pool3dFunctor, double>; +template class Pool3dGradFunctor, + double>; +template class Pool3dGradFunctor, + double>; + +/* + * All tensors are in NCHW format. + * Ksize, strides, paddings are two elements. These two elements represent + * height and width, respectively. 
+ */ +template +class MaxPool2dWithIndexFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, std::vector& ksize, + std::vector& strides, std::vector& paddings, + framework::Tensor* output, framework::Tensor* mask) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output->dims()[1]; + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + const int ksize_height = ksize[0]; + const int ksize_width = ksize[1]; + const int stride_height = strides[0]; + const int stride_width = strides[1]; + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + const int input_stride = input_height * input_width; + const int output_stride = output_height * output_width; + + const T1* input_data = input.data(); + T1* output_data = output->mutable_data(context.GetPlace()); + T2* mask_data = mask->mutable_data(context.GetPlace()); + + for (int i = 0; i < batch_size; i++) { + for (int c = 0; c < output_channels; ++c) { + for (int ph = 0; ph < output_height; ++ph) { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + for (int pw = 0; pw < output_width; ++pw) { + int wstart = pw * stride_width - padding_width; + int wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + + T1 ele = static_cast(-FLT_MAX); + int index = -1; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + if (ele < input_data[h * input_width + w]) { + ele = input_data[h * input_width + w]; + index = h * input_width + w; + } + } + } + output_data[ph * output_width + pw] = ele; + mask_data[ph * output_width + pw] = index; + } + } + // offset + input_data += input_stride; + output_data += output_stride; + mask_data += output_stride; + } + } + } +}; + +/* + * All tensors are in NCHW format. + * Ksize, strides, paddings are two elements. These two elements represent + * height and width, respectively. 
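+ * The gradient functor below does not recompute the pooling windows; it
+ * scatters gradients through the saved mask of flattened spatial indices:
+ * input_grad[mask[idx]] += output_grad[idx].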
+ */ +template +class MaxPool2dWithIndexGradFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& output_grad, + const framework::Tensor& mask, std::vector& ksize, + std::vector& strides, std::vector& paddings, + framework::Tensor* input_grad) { + const int batch_size = input_grad->dims()[0]; + const int input_height = input_grad->dims()[2]; + const int input_width = input_grad->dims()[3]; + const int output_channels = output_grad.dims()[1]; + const int output_height = output_grad.dims()[2]; + const int output_width = output_grad.dims()[3]; + const int input_stride = input_height * input_width; + const int output_stride = output_height * output_width; + + const T2* mask_data = mask.data(); + const T1* output_grad_data = output_grad.data(); + T1* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + for (int n = 0; n < batch_size; ++n) { + for (int c = 0; c < output_channels; ++c) { + for (int ph = 0; ph < output_height; ++ph) { + for (int pw = 0; pw < output_width; ++pw) { + const int output_idx = ph * output_width + pw; + const int input_idx = static_cast(mask_data[output_idx]); + input_grad_data[input_idx] += output_grad_data[output_idx]; + } + } + // offset + input_grad_data += input_stride; + output_grad_data += output_stride; + mask_data += output_stride; + } + } + } +}; + +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; + +/* + * All tensors are in NCDHW format. + * Ksize, strides, paddings are three elements. These three elements represent + * depth, height and width, respectively. + */ +template +class MaxPool3dWithIndexFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, std::vector& ksize, + std::vector& strides, std::vector& paddings, + framework::Tensor* output, framework::Tensor* mask) { + const int batch_size = input.dims()[0]; + const int input_depth = input.dims()[2]; + const int input_height = input.dims()[3]; + const int input_width = input.dims()[4]; + const int output_channels = output->dims()[1]; + const int output_depth = output->dims()[2]; + const int output_height = output->dims()[3]; + const int output_width = output->dims()[4]; + const int ksize_depth = ksize[0]; + const int ksize_height = ksize[1]; + const int ksize_width = ksize[2]; + const int stride_depth = strides[0]; + const int stride_height = strides[1]; + const int stride_width = strides[2]; + const int padding_depth = paddings[0]; + const int padding_height = paddings[1]; + const int padding_width = paddings[2]; + const int input_stride = input_depth * input_height * input_width; + const int output_stride = output_depth * output_height * output_width; + + const T1* input_data = input.data(); + T1* output_data = output->mutable_data(context.GetPlace()); + T2* mask_data = mask->mutable_data(context.GetPlace()); + + for (int i = 0; i < batch_size; i++) { + for (int c = 0; c < output_channels; ++c) { + for (int pd = 0; pd < output_depth; ++pd) { + int dstart = pd * stride_depth - padding_depth; + int dend = std::min(dstart + ksize_depth, input_depth); + dstart = std::max(dstart, 0); + for (int ph = 0; ph < output_height; ++ph) { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + for (int pw = 0; pw < output_width; ++pw) { + int wstart = pw * stride_width - 
padding_width; + int wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + + int output_idx = (pd * output_height + ph) * output_width + pw; + T1 ele = static_cast(-FLT_MAX); + int index = -1; + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int input_idx = (d * input_height + h) * input_width + w; + if (ele < input_data[input_idx]) { + index = input_idx; + ele = input_data[input_idx]; + } + } + } + } + output_data[output_idx] = ele; + mask_data[output_idx] = index; + } + } + } + // offset + input_data += input_stride; + output_data += output_stride; + mask_data += output_stride; + } + } + } +}; + +/* + * All tensors are in NCDHW format. + * Ksize, strides, paddings are three elements. These three elements represent + * depth, height and width, respectively. + */ +template +class MaxPool3dWithIndexGradFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& output_grad, + const framework::Tensor& mask, std::vector& ksize, + std::vector& strides, std::vector& paddings, + framework::Tensor* input_grad) { + const int batch_size = input_grad->dims()[0]; + const int input_depth = input_grad->dims()[2]; + const int input_height = input_grad->dims()[3]; + const int input_width = input_grad->dims()[4]; + const int output_channels = output_grad.dims()[1]; + const int output_depth = output_grad.dims()[2]; + const int output_height = output_grad.dims()[3]; + const int output_width = output_grad.dims()[4]; + const int input_stride = input_depth * input_height * input_width; + const int output_stride = output_depth * output_height * output_width; + + const T2* mask_data = mask.data(); + const T1* output_grad_data = output_grad.data(); + T1* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + for (int n = 0; n < batch_size; ++n) { + for (int c = 0; c < output_channels; ++c) { + for (int pd = 0; pd < output_depth; ++pd) { + for (int ph = 0; ph < output_height; ++ph) { + for (int pw = 0; pw < output_width; ++pw) { + const int output_idx = + (pd * output_height + ph) * output_width + pw; + const int input_idx = static_cast(mask_data[output_idx]); + input_grad_data[input_idx] += output_grad_data[output_idx]; + } + } + } + // offset + input_grad_data += input_stride; + output_grad_data += output_stride; + mask_data += output_stride; + } + } + } +}; + +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/pooling.cu b/paddle/fluid/operators/math/pooling.cu new file mode 100644 index 0000000000000000000000000000000000000000..c65632de9066251a94ab3c32d4382d44f4120c2a --- /dev/null +++ b/paddle/fluid/operators/math/pooling.cu @@ -0,0 +1,1041 @@ +/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/pooling.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { +namespace math { + +template +__global__ void KernelPool2D(const int nthreads, const T* input_data, + const int channels, const int input_height, + const int input_width, const int output_height, + const int output_width, const int ksize_height, + const int ksize_width, const int stride_height, + const int stride_width, const int padding_height, + const int padding_width, PoolProcess pool_process, + T* output_data) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; + index += blockDim.x * gridDim.x) { + int pw = index % output_width; + int ph = (index / output_width) % output_height; + int c = (index / output_width / output_height) % channels; + int batch_idx = index / output_width / output_height / channels; + + int hstart = ph * stride_height - padding_height; + int hend = min(hstart + ksize_height, input_height); + hstart = max(hstart, 0); + + int wstart = pw * stride_width - padding_width; + int wend = min(wstart + ksize_width, input_width); + wstart = max(wstart, 0); + + input_data += (batch_idx * channels + c) * input_height * input_width; + T ele = pool_process.initial(); + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + pool_process.compute(ele, input_data[h * input_width + w]); + } + } + int pool_size = (hend - hstart) * (wend - wstart); + pool_process.finalize(ele, (static_cast(pool_size))); + output_data[index] = ele; + } +} + +template +__global__ void KernelPool2DGrad( + const int nthreads, const T* input_data, const T* output_data, + const T* output_grad, const int channels, const int input_height, + const int input_width, const int output_height, const int output_width, + const int ksize_height, const int ksize_width, const int stride_height, + const int stride_width, const int padding_height, const int padding_width, + PoolProcess pool_process, T* input_grad) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; + index += blockDim.x * gridDim.x) { + int offsetW = index % input_width + padding_width; + int offsetH = (index / input_width) % input_height + padding_height; + int offsetC = (index / input_width / input_height) % channels; + int batch_idx = index / input_width / input_height / channels; + + int phstart = (offsetH < ksize_height) + ? 0 + : (offsetH - ksize_height) / stride_height + 1; + int pwstart = (offsetW < ksize_width) + ? 
0 + : (offsetW - ksize_width) / stride_width + 1; + int phend = min(offsetH / stride_height + 1, output_height); + int pwend = min(offsetW / stride_width + 1, output_width); + T gradient = 0; + T input = input_data[index]; + int output_idx = + (batch_idx * channels + offsetC) * output_height * output_width; + output_data += output_idx; + output_grad += output_idx; + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + int hstart = ph * stride_height - padding_height; + int wstart = pw * stride_width - padding_width; + int hend = min(hstart + ksize_height, input_height); + int wend = min(wstart + ksize_width, input_width); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + int pool_size = (hend - hstart) * (wend - wstart); + int output_sub_idx = ph * output_width + pw; + pool_process.compute(input, output_data[output_sub_idx], + output_grad[output_sub_idx], gradient, + static_cast(1.0 / pool_size)); + } + } + input_grad[index] = gradient; + } +} + +template +__global__ void KernelMaxPool2DGrad( + const int nthreads, const T* input_data, const T* output_data, + const T* output_grad, const int channels, const int input_height, + const int input_width, const int output_height, const int output_width, + const int ksize_height, const int ksize_width, const int stride_height, + const int stride_width, const int padding_height, const int padding_width, + T* input_grad) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; + index += blockDim.x * gridDim.x) { + int pw = index % output_width; + int ph = (index / output_width) % output_height; + int c = (index / output_width / output_height) % channels; + int batch_idx = index / output_width / output_height / channels; + + int hstart = ph * stride_height - padding_height; + int hend = min(hstart + ksize_height, input_height); + hstart = max(hstart, 0); + + int wstart = pw * stride_width - padding_width; + int wend = min(wstart + ksize_width, input_width); + wstart = max(wstart, 0); + + input_data += (batch_idx * channels + c) * input_height * input_width; + input_grad += (batch_idx * channels + c) * input_height * input_width; + + T ele = output_data[index]; + int maxIndex = -1; + bool stop = false; + for (int h = hstart; h < hend && !stop; ++h) { + for (int w = wstart; w < wend && !stop; ++w) { + if (ele == input_data[h * input_width + w]) { + maxIndex = h * input_width + w; + stop = true; + } + } + } + + if (maxIndex != -1) { + // atomic add + platform::CudaAtomicAdd(input_grad + maxIndex, output_grad[index]); + } + } +} + +/* + * All tensors are in NCHW format. + * Ksize, strides, paddings are two elements. These two elements represent + * height and width, respectively. 
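+ * The forward functor below launches one CUDA thread per output element
+ * (nthreads = batch_size * output_channels * output_height * output_width),
+ * using 1024 threads per block and a grid of (nthreads + 1023) / 1024 blocks;
+ * the gradient functor launches one thread per input element instead.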
+ */ +template +class Pool2dFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, std::vector& ksize, + std::vector& strides, std::vector& paddings, + PoolProcess pool_process, framework::Tensor* output) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[1]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output->dims()[1]; + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + const int ksize_height = ksize[0]; + const int ksize_width = ksize[1]; + const int stride_height = strides[0]; + const int stride_width = strides[1]; + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + + const T* input_data = input.data(); + T* output_data = output->mutable_data(context.GetPlace()); + + int nthreads = batch_size * output_channels * output_height * output_width; + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelPool2D<<>>( + nthreads, input_data, input_channels, input_height, input_width, + output_height, output_width, ksize_height, ksize_width, stride_height, + stride_width, padding_height, padding_width, pool_process, output_data); + } +}; + +/* + * All tensors are in NCHW format. + * Ksize, strides, paddings are two elements. These two elements represent + * height and width, respectively. + */ +template +class Pool2dGradFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& output, + const framework::Tensor& output_grad, std::vector& ksize, + std::vector& strides, std::vector& paddings, + PoolProcess pool_process, framework::Tensor* input_grad) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[1]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_height = output.dims()[2]; + const int output_width = output.dims()[3]; + const int ksize_height = ksize[0]; + const int ksize_width = ksize[1]; + const int stride_height = strides[0]; + const int stride_width = strides[1]; + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + int nthreads = batch_size * input_channels * input_height * input_width; + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelPool2DGrad<<>>( + nthreads, input_data, output_data, output_grad_data, input_channels, + input_height, input_width, output_height, output_width, ksize_height, + ksize_width, stride_height, stride_width, padding_height, padding_width, + pool_process, input_grad_data); + } +}; + +/* + * All tensors are in NCHW format. + * Ksize, strides, paddings are two elements. These two elements represent + * height and width, respectively. 
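+ * The kernel above (KernelMaxPool2DGrad) finds, for each output element, the
+ * first input in its window whose value equals the pooled output and
+ * accumulates the gradient there with platform::CudaAtomicAdd, since
+ * overlapping windows may select the same input element.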
+ */ +template +class MaxPool2dGradFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& output, + const framework::Tensor& output_grad, std::vector& ksize, + std::vector& strides, std::vector& paddings, + framework::Tensor* input_grad) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[1]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output.dims()[1]; + const int output_height = output.dims()[2]; + const int output_width = output.dims()[3]; + const int ksize_height = ksize[0]; + const int ksize_width = ksize[1]; + const int stride_height = strides[0]; + const int stride_width = strides[1]; + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + int nthreads = batch_size * output_channels * output_height * output_width; + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelMaxPool2DGrad<<>>( + nthreads, input_data, output_data, output_grad_data, input_channels, + input_height, input_width, output_height, output_width, ksize_height, + ksize_width, stride_height, stride_width, padding_height, padding_width, + input_grad_data); + } +}; + +template class MaxPool2dGradFunctor; +template class MaxPool2dGradFunctor; + +template class Pool2dFunctor, float>; +template class Pool2dFunctor, float>; +template class Pool2dGradFunctor, + float>; +template class Pool2dGradFunctor, + float>; +template class Pool2dFunctor, double>; +template class Pool2dFunctor, double>; +template class Pool2dGradFunctor, + double>; +template class Pool2dGradFunctor, + double>; + +template +__global__ void KernelPool3D(const int nthreads, const T* input_data, + const int channels, const int input_depth, + const int input_height, const int input_width, + const int output_depth, const int output_height, + const int output_width, const int ksize_depth, + const int ksize_height, const int ksize_width, + const int stride_depth, const int stride_height, + const int stride_width, const int padding_depth, + const int padding_height, const int padding_width, + PoolProcess pool_process, T* output_data) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; + index += blockDim.x * gridDim.x) { + int pw = index % output_width; + int ph = (index / output_width) % output_height; + int pd = (index / output_width / output_height) % output_depth; + int c = (index / output_width / output_height / output_depth) % channels; + int batch_idx = + index / output_width / output_height / output_depth / channels; + int dstart = pd * stride_depth - padding_depth; + int hstart = ph * stride_height - padding_height; + int wstart = pw * stride_width - padding_width; + int dend = min(dstart + ksize_depth, input_depth); + int hend = min(hstart + ksize_height, input_height); + int wend = min(wstart + ksize_width, input_width); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + T ele = pool_process.initial(); + input_data += + (batch_idx * channels + c) * input_depth * input_height * input_width; + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + 
pool_process.compute( + ele, input_data[(d * input_height + h) * input_width + w]); + } + } + } + int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + pool_process.finalize(ele, static_cast(pool_size)); + output_data[index] = ele; + } +} + +template +__global__ void KernelPool3DGrad( + const int nthreads, const T* input_data, const T* output_data, + const T* output_grad, const int channels, const int input_depth, + const int input_height, const int input_width, const int output_depth, + const int output_height, const int output_width, const int ksize_depth, + const int ksize_height, const int ksize_width, const int stride_depth, + const int stride_height, const int stride_width, const int padding_depth, + const int padding_height, const int padding_width, PoolProcess pool_process, + T* input_grad) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; + index += blockDim.x * gridDim.x) { + int offsetW = index % input_width + padding_width; + int offsetH = (index / input_width) % input_height + padding_height; + int offsetD = + (index / input_width / input_height) % input_depth + padding_depth; + int offsetC = (index / input_width / input_height / input_depth) % channels; + int batch_idx = index / input_width / input_height / input_depth / channels; + + int pdstart = (offsetD < ksize_depth) + ? 0 + : (offsetD - ksize_depth) / stride_depth + 1; + int phstart = (offsetH < ksize_height) + ? 0 + : (offsetH - ksize_height) / stride_height + 1; + int pwstart = (offsetW < ksize_width) + ? 0 + : (offsetW - ksize_width) / stride_width + 1; + int pdend = min((offsetD) / stride_depth + 1, output_depth); + int phend = min((offsetH) / stride_height + 1, output_height); + int pwend = min((offsetW) / stride_width + 1, output_width); + + T gradient = 0; + T input = input_data[index]; + int output_idx = (batch_idx * channels + offsetC) * output_depth * + output_height * output_width; + output_data += output_idx; + output_grad += output_idx; + + for (int pd = pdstart; pd < pdend; ++pd) { + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + // figure out the pooling size + int dstart = pd * stride_depth - padding_depth; + int hstart = ph * stride_height - padding_height; + int wstart = pw * stride_width - padding_width; + int dend = min(dstart + ksize_depth, input_depth); + int hend = min(hstart + ksize_height, input_height); + int wend = min(wstart + ksize_width, input_width); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + int output_sub_idx = (pd * output_height + ph) * output_width + pw; + pool_process.compute(input, output_data[output_sub_idx], + output_grad[output_sub_idx], gradient, + static_cast(1.0 / pool_size)); + } + } + } + input_grad[index] = gradient; + } +} + +template +__global__ void KernelMaxPool3DGrad( + const int nthreads, const T* input_data, const T* output_data, + const T* output_grad, const int channels, const int input_depth, + const int input_height, const int input_width, const int output_depth, + const int output_height, const int output_width, const int ksize_depth, + const int ksize_height, const int ksize_width, const int stride_depth, + const int stride_height, const int stride_width, const int padding_depth, + const int padding_height, const int padding_width, T* input_grad) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; + index += blockDim.x * gridDim.x) { + int 
pw = index % output_width; + int ph = (index / output_width) % output_height; + int pd = (index / output_width / output_height) % output_depth; + int c = (index / output_width / output_height / output_depth) % channels; + int batch_idx = + index / output_width / output_height / output_depth / channels; + int dstart = pd * stride_depth - padding_depth; + int hstart = ph * stride_height - padding_height; + int wstart = pw * stride_width - padding_width; + int dend = min(dstart + ksize_depth, input_depth); + int hend = min(hstart + ksize_height, input_height); + int wend = min(wstart + ksize_width, input_width); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + T ele = output_data[index]; + bool stop = false; + int maxIdx = -1; + input_data += + (batch_idx * channels + c) * input_depth * input_height * input_width; + input_grad += + (batch_idx * channels + c) * input_depth * input_height * input_width; + + for (int d = dstart; d < dend && !stop; ++d) { + for (int h = hstart; h < hend && !stop; ++h) { + for (int w = wstart; w < wend && !stop; ++w) { + if (ele == input_data[(d * input_height + h) * input_width + w]) { + stop = true; + maxIdx = (d * input_height + h) * input_width + w; + } + } + } + } + if (maxIdx != -1) { + // atomic add + platform::CudaAtomicAdd(input_grad + maxIdx, output_grad[index]); + } + } +} + +/* + * All tensors are in NCDHW format. + * Ksize, strides, paddings are three elements. These three elements represent + * depth, height and width, respectively. + */ +template +class Pool3dFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, std::vector& ksize, + std::vector& strides, std::vector& paddings, + PoolProcess pool_process, framework::Tensor* output) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[1]; + const int input_depth = input.dims()[2]; + const int input_height = input.dims()[3]; + const int input_width = input.dims()[4]; + const int output_channels = output->dims()[1]; + const int output_depth = output->dims()[2]; + const int output_height = output->dims()[3]; + const int output_width = output->dims()[4]; + const int ksize_depth = ksize[0]; + const int ksize_height = ksize[1]; + const int ksize_width = ksize[2]; + const int stride_depth = strides[0]; + const int stride_height = strides[1]; + const int stride_width = strides[2]; + const int padding_depth = paddings[0]; + const int padding_height = paddings[1]; + const int padding_width = paddings[2]; + + const T* input_data = input.data(); + T* output_data = output->mutable_data(context.GetPlace()); + + int nthreads = batch_size * output_channels * output_depth * output_height * + output_width; + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelPool3D<<>>( + nthreads, input_data, input_channels, input_depth, input_height, + input_width, output_depth, output_height, output_width, ksize_depth, + ksize_height, ksize_width, stride_depth, stride_height, stride_width, + padding_depth, padding_height, padding_width, pool_process, + output_data); + } +}; + +/* + * All tensors are in NCDHW format. + * Ksize, strides, paddings are three elements. These three elements represent + * depth, height and width, respectively. 
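+ * The backward kernel assigns one thread per input element and accumulates
+ * the contribution of every output window that covers that element.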
+ */ +template +class Pool3dGradFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& output, + const framework::Tensor& output_grad, std::vector& ksize, + std::vector& strides, std::vector& paddings, + PoolProcess pool_process, framework::Tensor* input_grad) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[1]; + const int input_depth = input.dims()[2]; + const int input_height = input.dims()[3]; + const int input_width = input.dims()[4]; + const int output_channels = output.dims()[1]; + const int output_depth = output.dims()[2]; + const int output_height = output.dims()[3]; + const int output_width = output.dims()[4]; + const int ksize_depth = ksize[0]; + const int ksize_height = ksize[1]; + const int ksize_width = ksize[2]; + const int stride_depth = strides[0]; + const int stride_height = strides[1]; + const int stride_width = strides[2]; + const int padding_depth = paddings[0]; + const int padding_height = paddings[1]; + const int padding_width = paddings[2]; + + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + int nthreads = + batch_size * input_channels * input_depth * input_height * input_width; + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelPool3DGrad<<>>( + nthreads, input_data, output_data, output_grad_data, input_channels, + input_depth, input_height, input_width, output_depth, output_height, + output_width, ksize_depth, ksize_height, ksize_width, stride_depth, + stride_height, stride_width, padding_depth, padding_height, + padding_width, pool_process, input_grad_data); + } +}; + +/* + * All tensors are in NCDHW format. + * Ksize, strides, paddings are three elements. These three elements represent + * depth, height and width, respectively. 
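+ * As in the 2D case, the gradient goes to the first input element matching
+ * the pooled maximum and is accumulated with platform::CudaAtomicAdd.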
+ */ +template +class MaxPool3dGradFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& output, + const framework::Tensor& output_grad, std::vector& ksize, + std::vector& strides, std::vector& paddings, + framework::Tensor* input_grad) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[1]; + const int input_depth = input.dims()[2]; + const int input_height = input.dims()[3]; + const int input_width = input.dims()[4]; + const int output_channels = output.dims()[1]; + const int output_depth = output.dims()[2]; + const int output_height = output.dims()[3]; + const int output_width = output.dims()[4]; + const int ksize_depth = ksize[0]; + const int ksize_height = ksize[1]; + const int ksize_width = ksize[2]; + const int stride_depth = strides[0]; + const int stride_height = strides[1]; + const int stride_width = strides[2]; + const int padding_depth = paddings[0]; + const int padding_height = paddings[1]; + const int padding_width = paddings[2]; + + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + int nthreads = batch_size * output_channels * output_depth * output_height * + output_width; + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelMaxPool3DGrad<<>>( + nthreads, input_data, output_data, output_grad_data, input_channels, + input_depth, input_height, input_width, output_depth, output_height, + output_width, ksize_depth, ksize_height, ksize_width, stride_depth, + stride_height, stride_width, padding_depth, padding_height, + padding_width, input_grad_data); + } +}; + +template class MaxPool3dGradFunctor; +template class MaxPool3dGradFunctor; + +template class Pool3dFunctor, float>; +template class Pool3dFunctor, float>; +template class Pool3dGradFunctor, + float>; +template class Pool3dGradFunctor, + float>; +template class Pool3dFunctor, double>; +template class Pool3dFunctor, double>; +template class Pool3dGradFunctor, + double>; +template class Pool3dGradFunctor, + double>; + +template +__global__ void KernelMaxPool2dWithIdx( + const int nthreads, const T1* input_data, const int channels, + const int input_height, const int input_width, const int output_height, + const int output_width, const int ksize_height, const int ksize_width, + const int stride_height, const int stride_width, const int padding_height, + const int padding_width, T1* output_data, T2* mask_data) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; + index += blockDim.x * gridDim.x) { + int pw = index % output_width; + int ph = (index / output_width) % output_height; + int c = (index / output_width / output_height) % channels; + int batch_idx = index / output_width / output_height / channels; + + int hstart = ph * stride_height - padding_height; + int hend = min(hstart + ksize_height, input_height); + hstart = max(hstart, 0); + + int wstart = pw * stride_width - padding_width; + int wend = min(wstart + ksize_width, input_width); + wstart = max(wstart, 0); + + input_data += (batch_idx * channels + c) * input_height * input_width; + T1 ele = -FLT_MAX; + int max_index = -1; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int input_index = h * input_width + w; + if (ele < input_data[input_index]) { + max_index = input_index; + ele = 
input_data[input_index]; + } + } + } + output_data[index] = ele; + mask_data[index] = max_index; + } +} + +template +__global__ void KernelMaxPool2DWithIdxGrad( + const int nthreads, const T1* output_grad, const T2* mask_data, + const int channels, const int input_height, const int input_width, + const int output_height, const int output_width, const int ksize_height, + const int ksize_width, const int stride_height, const int stride_width, + const int padding_height, const int padding_width, T1* input_grad) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; + index += blockDim.x * gridDim.x) { + int w_offset = index % input_width; + int h_offset = (index / input_width) % input_height; + int c_offset = (index / input_width / input_height) % channels; + int batch_idx = index / input_width / input_height / channels; + + int ph_start = + (h_offset + padding_height < ksize_height) + ? 0 + : (h_offset + padding_height - ksize_height) / stride_height + 1; + int pw_start = + (w_offset + padding_width < ksize_width) + ? 0 + : (w_offset + padding_width - ksize_width) / stride_width + 1; + int ph_end = + min((h_offset + padding_height) / stride_height + 1, output_height); + int pw_end = + min((w_offset + padding_width) / stride_width + 1, output_width); + + T1 gradient = 0; + int input_current_featuremap_idx = h_offset * input_width + w_offset; + int output_idx = + (batch_idx * channels + c_offset) * output_height * output_width; + + mask_data += output_idx; + output_grad += output_idx; + for (int ph = ph_start; ph < ph_end; ++ph) { + for (int pw = pw_start; pw < pw_end; ++pw) { + if (mask_data[ph * output_width + pw] == input_current_featuremap_idx) + gradient += output_grad[ph * output_width + pw]; + } + } + input_grad[index] = gradient; + } +} + +/* + * All tensors are in NCHW format. + * Ksize, strides, paddings are two elements. These two elements represent + * height and width, respectively. + */ +template +class MaxPool2dWithIndexFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, std::vector& ksize, + std::vector& strides, std::vector& paddings, + framework::Tensor* output, framework::Tensor* mask) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[1]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output->dims()[1]; + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + const int ksize_height = ksize[0]; + const int ksize_width = ksize[1]; + const int stride_height = strides[0]; + const int stride_width = strides[1]; + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + + const T1* input_data = input.data(); + T1* output_data = output->mutable_data(context.GetPlace()); + T2* mask_data = mask->mutable_data(context.GetPlace()); + + int nthreads = batch_size * output_channels * output_height * output_width; + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelMaxPool2dWithIdx<<>>( + nthreads, input_data, input_channels, input_height, input_width, + output_height, output_width, ksize_height, ksize_width, stride_height, + stride_width, padding_height, padding_width, output_data, mask_data); + } +}; + +/* + * All tensors are in NCHW format. + * Ksize, strides, paddings are two elements. These two elements represent + * height and width, respectively. 
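+ * The backward kernel runs one thread per input element and sums
+ * output_grad over every output position whose mask recorded this element
+ * as the maximum.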
+ */ +template +class MaxPool2dWithIndexGradFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& output_grad, + const framework::Tensor& mask, std::vector& ksize, + std::vector& strides, std::vector& paddings, + framework::Tensor* input_grad) { + const int batch_size = input_grad->dims()[0]; + const int input_channels = input_grad->dims()[1]; + const int input_height = input_grad->dims()[2]; + const int input_width = input_grad->dims()[3]; + const int output_height = output_grad.dims()[2]; + const int output_width = output_grad.dims()[3]; + const int ksize_height = ksize[0]; + const int ksize_width = ksize[1]; + const int stride_height = strides[0]; + const int stride_width = strides[1]; + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + + const T2* mask_data = mask.data(); + const T1* output_grad_data = output_grad.data(); + T1* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + int nthreads = batch_size * input_channels * input_height * input_width; + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelMaxPool2DWithIdxGrad<<>>( + nthreads, output_grad_data, mask_data, input_channels, input_height, + input_width, output_height, output_width, ksize_height, ksize_width, + stride_height, stride_width, padding_height, padding_width, + input_grad_data); + } +}; + +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; + +template +__global__ void KernelMaxPool3DWithIdx( + const int nthreads, const T1* input_data, const int channels, + const int input_depth, const int input_height, const int input_width, + const int output_depth, const int output_height, const int output_width, + const int ksize_depth, const int ksize_height, const int ksize_width, + const int stride_depth, const int stride_height, const int stride_width, + const int padding_depth, const int padding_height, const int padding_width, + T1* output_data, T2* mask_data) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; + index += blockDim.x * gridDim.x) { + int pw = index % output_width; + int ph = (index / output_width) % output_height; + int pd = (index / output_width / output_height) % output_depth; + int c = (index / output_width / output_height / output_depth) % channels; + int batch_idx = + index / output_width / output_height / output_depth / channels; + + int dstart = pd * stride_depth - padding_depth; + int hstart = ph * stride_height - padding_height; + int wstart = pw * stride_width - padding_width; + int dend = min(dstart + ksize_depth, input_depth); + int hend = min(hstart + ksize_height, input_height); + int wend = min(wstart + ksize_width, input_width); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + + T1 ele = -FLT_MAX; + int max_index = -1; + input_data += + (batch_idx * channels + c) * input_depth * input_height * input_width; + + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + if (ele < input_data[(d * input_height + h) * input_width + w]) { + max_index = (d * input_height + h) * input_width + w; + ele = input_data[max_index]; + } + } + } + } + output_data[index] = ele; + mask_data[index] = max_index; + } +} + +template +__global__ void KernelMaxPool3DWithIdxGrad( + const int nthreads, const T1* 
output_grad, const T2* mask, + const int channels, const int input_depth, const int input_height, + const int input_width, const int output_depth, const int output_height, + const int output_width, const int ksize_depth, const int ksize_height, + const int ksize_width, const int stride_depth, const int stride_height, + const int stride_width, const int padding_depth, const int padding_height, + const int padding_width, T1* input_grad) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; + index += blockDim.x * gridDim.x) { + int w_offset = index % input_width; + int h_offset = (index / input_width) % input_height; + int d_offset = (index / input_width / input_height) % input_depth; + int c_offset = + (index / input_width / input_height / input_depth) % channels; + int batch_idx = index / input_width / input_height / input_depth / channels; + + int pd_start = + (d_offset + padding_depth < ksize_depth) + ? 0 + : (d_offset + padding_depth - ksize_depth) / stride_depth + 1; + int ph_start = + (h_offset + padding_height < ksize_height) + ? 0 + : (h_offset + padding_height - ksize_height) / stride_height + 1; + int pw_start = + (w_offset + padding_width < ksize_width) + ? 0 + : (w_offset + padding_width - ksize_width) / stride_width + 1; + int pd_end = + min((d_offset + padding_depth) / stride_depth + 1, output_depth); + int ph_end = + min((h_offset + padding_height) / stride_height + 1, output_height); + int pw_end = + min((w_offset + padding_width) / stride_width + 1, output_width); + + T1 gradient = 0; + int input_current_feature_map_idx = + (d_offset * input_height + h_offset) * input_width + w_offset; + int output_idx = (batch_idx * channels + c_offset) * output_depth * + output_height * output_width; + mask += output_idx; + output_grad += output_idx; + + for (int pd = pd_start; pd < pd_end; ++pd) { + for (int ph = ph_start; ph < ph_end; ++ph) { + for (int pw = pw_start; pw < pw_end; ++pw) { + if (mask[(pd * output_height + ph) * output_width + pw] == + input_current_feature_map_idx) + gradient += + output_grad[(pd * output_height + ph) * output_width + pw]; + } + } + } + input_grad[index] = gradient; + } +} + +/* + * All tensors are in NCDHW format. + * Ksize, strides, paddings are three elements. These three elements represent + * depth, height and width, respectively. 
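+ * The mask stores the flat index of the maximum inside one D x H x W
+ * feature map, i.e. (d * input_height + h) * input_width + w.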
+ */ +template +class MaxPool3dWithIndexFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, std::vector& ksize, + std::vector& strides, std::vector& paddings, + framework::Tensor* output, framework::Tensor* mask) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[1]; + const int input_depth = input.dims()[2]; + const int input_height = input.dims()[3]; + const int input_width = input.dims()[4]; + const int output_channels = output->dims()[1]; + const int output_depth = output->dims()[2]; + const int output_height = output->dims()[3]; + const int output_width = output->dims()[4]; + const int ksize_depth = ksize[0]; + const int ksize_height = ksize[1]; + const int ksize_width = ksize[2]; + const int stride_depth = strides[0]; + const int stride_height = strides[1]; + const int stride_width = strides[2]; + const int padding_depth = paddings[0]; + const int padding_height = paddings[1]; + const int padding_width = paddings[2]; + + const T1* input_data = input.data(); + T1* output_data = output->mutable_data(context.GetPlace()); + T2* mask_data = mask->mutable_data(context.GetPlace()); + + int nthreads = batch_size * output_channels * output_depth * output_height * + output_width; + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelMaxPool3DWithIdx<<>>( + nthreads, input_data, input_channels, input_depth, input_height, + input_width, output_depth, output_height, output_width, ksize_depth, + ksize_height, ksize_width, stride_depth, stride_height, stride_width, + padding_depth, padding_height, padding_width, output_data, mask_data); + } +}; + +/* + * All tensors are in NCDHW format. + * Ksize, strides, paddings are three elements. These three elements represent + * depth, height and width, respectively. 
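+ * Input dimensions are read from input_grad, which is therefore expected to
+ * already be sized to the input shape when this functor is called.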
+ */ +template +class MaxPool3dWithIndexGradFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& output_grad, + const framework::Tensor& mask, std::vector& ksize, + std::vector& strides, std::vector& paddings, + framework::Tensor* input_grad) { + const int batch_size = input_grad->dims()[0]; + const int input_channels = input_grad->dims()[1]; + const int input_depth = input_grad->dims()[2]; + const int input_height = input_grad->dims()[3]; + const int input_width = input_grad->dims()[4]; + const int output_depth = output_grad.dims()[2]; + const int output_height = output_grad.dims()[3]; + const int output_width = output_grad.dims()[4]; + const int ksize_depth = ksize[0]; + const int ksize_height = ksize[1]; + const int ksize_width = ksize[2]; + const int stride_depth = strides[0]; + const int stride_height = strides[1]; + const int stride_width = strides[2]; + const int padding_depth = paddings[0]; + const int padding_height = paddings[1]; + const int padding_width = paddings[2]; + + const T1* output_grad_data = output_grad.data(); + const T2* mask_data = mask.data(); + T1* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + int nthreads = + batch_size * input_channels * input_depth * input_height * input_width; + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelMaxPool3DWithIdxGrad<<>>( + nthreads, output_grad_data, mask_data, input_channels, input_depth, + input_height, input_width, output_depth, output_height, output_width, + ksize_depth, ksize_height, ksize_width, stride_depth, stride_height, + stride_width, padding_depth, padding_height, padding_width, + input_grad_data); + } +}; + +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/pooling.h b/paddle/fluid/operators/math/pooling.h new file mode 100644 index 0000000000000000000000000000000000000000..1195038f6a067fde776df8013a2a81e7003489a1 --- /dev/null +++ b/paddle/fluid/operators/math/pooling.h @@ -0,0 +1,192 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { +namespace math { + +#define FLT_MAX \ + __FLT_MAX__ // It might need to be placed in another file, but I'm still + // wondering where to put it. + +/* + * \brief Extracting simple operations from pooling. + * Both MaxPool and AvgPool need "initial", "compute" and "finalize" + * operation. + * MaxPool initializes temp variable to the negative maximum to find the + * maximum value in the pooling field. 
+ * AvgPool initializes temp variable to the zero to accumulate all values + * in pool pooling, and finally takes the average. + * MaxPoolGrad and AvgPoolGrad are gradient operations respectively. + */ +template +class MaxPool { + public: + DEVICE inline T initial() { return static_cast(-FLT_MAX); } + DEVICE inline void compute(T& y, const T& x) { y = y > x ? y : x; } + DEVICE inline void finalize(T& y, const T& pool_field) {} +}; + +template +class AvgPool { + public: + DEVICE inline T initial() { return static_cast(0); } + DEVICE inline void compute(T& y, const T& x) { y += x; } + DEVICE inline void finalize(T& y, const T& pool_field) { y /= pool_field; } +}; + +template +class MaxPoolGrad { + public: + DEVICE inline void compute(const T& x, const T& y, const T& dy, T& dx, + T scale) { + dx += dy * (x == y); + } +}; + +template +class AvgPoolGrad { + public: + DEVICE inline void compute(const T& x, const T& y, const T& dy, T& dx, + T scale) { + dx += (scale * dy); + } +}; + +/* + * \brief Getting pooling results, and calculating gradient. + * + * In pool2d, all tensors are in NCHW format. Where N is batch size, C is the + * number of channels, H and W is the height and width of feature. + * In pool3d, all tensors are in NCDHW format. Where N is batch size, C is the + * number of channels, D, H and W is the depth, height and width of feature. + * + * In max pooling, it is possible that the pooling region has multiple maximum + * elements. In this case, we should compute the gradient of the first maximum + * element. + * This is different from average pooling. So we rewrite the max_pool_grad: + * MaxPool2dGradFunctor, MaxPool3dGradFunctor. + */ +template +class Pool2dFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + std::vector& ksize, std::vector& strides, + std::vector& paddings, PoolProcess pool_compute, + framework::Tensor* output); +}; + +template +class Pool2dGradFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& output, + const framework::Tensor& output_grad, std::vector& ksize, + std::vector& strides, std::vector& paddings, + PoolProcess pool_compute, framework::Tensor* input_grad); +}; + +template +class MaxPool2dGradFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& output, + const framework::Tensor& output_grad, std::vector& ksize, + std::vector& strides, std::vector& paddings, + framework::Tensor* input_grad); +}; + +template +class Pool3dFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + std::vector& ksize, std::vector& strides, + std::vector& paddings, PoolProcess pool_compute, + framework::Tensor* output); +}; + +template +class Pool3dGradFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& output, + const framework::Tensor& output_grad, std::vector& ksize, + std::vector& strides, std::vector& paddings, + PoolProcess pool_compute, framework::Tensor* input_grad); +}; + +template +class MaxPool3dGradFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& output, + const framework::Tensor& output_grad, std::vector& ksize, + std::vector& strides, std::vector& paddings, + framework::Tensor* input_grad); +}; + +/* + * \brief Getting max pooling results and corresponding max index, and + * 
calculating gradient. + * In up-sampling-pooling, it is necessary to know max element index. + * In pool2d, all tensors are in NCHW format. In pool3d, all tensors are in + * NCDHW format. + */ +template +class MaxPool2dWithIndexFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + std::vector& ksize, std::vector& strides, + std::vector& paddings, framework::Tensor* output, + framework::Tensor* mask); +}; + +template +class MaxPool2dWithIndexGradFunctor { + public: + void operator()(const DeviceContext& context, + const framework::Tensor& output_grad, + const framework::Tensor& mask, std::vector& ksize, + std::vector& strides, std::vector& paddings, + framework::Tensor* input_grad); +}; + +template +class MaxPool3dWithIndexFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + std::vector& ksize, std::vector& strides, + std::vector& paddings, framework::Tensor* output, + framework::Tensor* mask); +}; + +template +class MaxPool3dWithIndexGradFunctor { + public: + void operator()(const DeviceContext& context, + const framework::Tensor& output_grad, + const framework::Tensor& mask, std::vector& ksize, + std::vector& strides, std::vector& paddings, + framework::Tensor* input_grad); +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/sampler.cc b/paddle/fluid/operators/math/sampler.cc similarity index 100% rename from paddle/operators/math/sampler.cc rename to paddle/fluid/operators/math/sampler.cc diff --git a/paddle/operators/math/sampler.h b/paddle/fluid/operators/math/sampler.h similarity index 100% rename from paddle/operators/math/sampler.h rename to paddle/fluid/operators/math/sampler.h diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc new file mode 100644 index 0000000000000000000000000000000000000000..01aa37ab35ce906133f6195df5d7014b4fb23d16 --- /dev/null +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -0,0 +1,298 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" + +namespace paddle { +namespace operators { +namespace math { +template +struct SelectedRowsAdd { + void operator()(const platform::CPUDeviceContext& context, + const framework::SelectedRows& input1, + const framework::SelectedRows& input2, + framework::SelectedRows* output) { + auto in1_height = input1.height(); + PADDLE_ENFORCE_EQ(in1_height, input2.height()); + output->set_height(in1_height); + + auto& in1_rows = input1.rows(); + auto& in2_rows = input2.rows(); + std::vector out_rows; + out_rows.reserve(in1_rows.size() + in2_rows.size()); + + // concat rows + out_rows.insert(out_rows.end(), in1_rows.begin(), in1_rows.end()); + out_rows.insert(out_rows.end(), in2_rows.begin(), in2_rows.end()); + output->set_rows(out_rows); + + auto* out_value = output->mutable_value(); + auto& in1_value = input1.value(); + auto& in2_value = input2.value(); + + auto in1_row_numel = in1_value.numel() / in1_rows.size(); + PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size()); + PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size()); + + auto in1_place = input1.place(); + PADDLE_ENFORCE(platform::is_cpu_place(in1_place)); + auto in2_place = input2.place(); + PADDLE_ENFORCE(platform::is_cpu_place(in2_place)); + auto out_place = context.GetPlace(); + PADDLE_ENFORCE(platform::is_cpu_place(out_place)); + + auto* out_data = out_value->data(); + auto* in1_data = in1_value.data(); + memory::Copy(boost::get(out_place), out_data, + boost::get(in1_place), in1_data, + in1_value.numel() * sizeof(T)); + + auto* in2_data = in2_value.data(); + memory::Copy(boost::get(out_place), + out_data + in1_value.numel(), + boost::get(in2_place), in2_data, + in2_value.numel() * sizeof(T)); + } +}; + +template struct SelectedRowsAdd; +template struct SelectedRowsAdd; + +template +struct SelectedRowsAddTensor { + void operator()(const platform::CPUDeviceContext& context, + const framework::SelectedRows& input1, + const framework::Tensor& input2, framework::Tensor* output) { + auto in1_height = input1.height(); + auto in2_dims = input2.dims(); + auto out_dims = output->dims(); + PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); + PADDLE_ENFORCE_EQ(in1_height, out_dims[0]); + + auto& in1_value = input1.value(); + auto& in1_rows = input1.rows(); + + int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); + PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height); + PADDLE_ENFORCE_EQ(in1_row_numel, output->numel() / in1_height); + + SetConstant functor; + functor(context, output, 0.0); + + auto* in1_data = in1_value.data(); + auto* out_data = output->data(); + + for (size_t i = 0; i < in1_rows.size(); i++) { + for (int64_t j = 0; j < in1_row_numel; j++) { + out_data[in1_rows[i] * in1_row_numel + j] += + in1_data[i * in1_row_numel + j]; + } + } + + auto out_eigen = framework::EigenVector::Flatten(*output); + auto in2_eigen = framework::EigenVector::Flatten(input2); + out_eigen.device(*context.eigen_device()) = out_eigen + in2_eigen; + } +}; + +template struct SelectedRowsAddTensor; +template struct SelectedRowsAddTensor; + +template +struct SelectedRowsAddTo { + void operator()(const platform::CPUDeviceContext& context, + const framework::SelectedRows& input1, + const int64_t input2_offset, + framework::SelectedRows* input2) { + auto in1_height = input1.height(); + PADDLE_ENFORCE_EQ(in1_height, input2->height()); + + auto& in1_rows = input1.rows(); + auto& in2_rows = 
*(input2->mutable_rows()); + + auto& in1_value = input1.value(); + auto* in2_value = input2->mutable_value(); + + // concat rows + in2_rows.Extend(in1_rows.begin(), in1_rows.end()); + + auto in1_place = input1.place(); + PADDLE_ENFORCE(platform::is_cpu_place(in1_place)); + auto in2_place = input2->place(); + PADDLE_ENFORCE(platform::is_cpu_place(in2_place)); + + auto* in1_data = in1_value.data(); + auto* in2_data = in2_value->data(); + memory::Copy(boost::get(in2_place), + in2_data + input2_offset, + boost::get(in1_place), in1_data, + in1_value.numel() * sizeof(T)); + } +}; + +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; + +template +struct SelectedRowsAddToTensor { + void operator()(const platform::CPUDeviceContext& context, + const framework::SelectedRows& input1, + framework::Tensor* input2) { + auto in1_height = input1.height(); + auto in2_dims = input2->dims(); + PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); + + auto& in1_value = input1.value(); + auto& in1_rows = input1.rows(); + + int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); + PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); + + auto* in1_data = in1_value.data(); + auto* input2_data = input2->data(); + + for (size_t i = 0; i < in1_rows.size(); i++) { + for (int64_t j = 0; j < in1_row_numel; j++) { + input2_data[in1_rows[i] * in1_row_numel + j] += + in1_data[i * in1_row_numel + j]; + } + } + } +}; + +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; + +// This is a separated namespace for manipulate SelectedRows typed +// data. Like merge duplicated rows, adding two SelectedRows etc. +// +// Another group of functors is called "scatter updates", which means +// use SelectedRows to update a dense tensor with different Ops, like +// add or mul. 
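+// For example, MergeAdd below collapses duplicated row ids: rows {0, 4, 0}
+// become {0, 4} and the two value rows that shared id 0 are summed.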
+namespace scatter { + +size_t FindPos(const std::vector& rows, int64_t value) { + return std::find(rows.begin(), rows.end(), value) - rows.begin(); +} + +template +struct MergeAdd { + framework::SelectedRows operator()(const platform::CPUDeviceContext& context, + const framework::SelectedRows& input) { + framework::SelectedRows out; + auto input_rows = input.rows(); + std::set row_set(input_rows.begin(), input_rows.end()); + std::vector merge_rows(row_set.begin(), row_set.end()); + + auto input_width = input.value().dims()[1]; + out.set_rows(merge_rows); + out.set_height(input.height()); + out.mutable_value()->mutable_data( + framework::make_ddim( + {static_cast(merge_rows.size()), input_width}), + context.GetPlace()); + + math::SetConstant constant_functor; + constant_functor(context, out.mutable_value(), 0.0); + + auto* out_data = out.mutable_value()->data(); + auto* input_data = input.value().data(); + + for (size_t i = 0; i < input_rows.size(); i++) { + size_t out_i = FindPos(merge_rows, input_rows[i]); + for (int64_t j = 0; j < input_width; j++) { + out_data[out_i * input_width + j] += input_data[i * input_width + j]; + } + } + return out; + } +}; + +template struct MergeAdd; +template struct MergeAdd; +template struct MergeAdd; +template struct MergeAdd; + +template +struct UpdateToTensor { + void operator()(const platform::CPUDeviceContext& context, + const ScatterOps& op, const framework::SelectedRows& input1, + framework::Tensor* input2) { + auto in1_height = input1.height(); + auto in2_dims = input2->dims(); + PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); + + auto& in1_value = input1.value(); + auto& in1_rows = input1.rows(); + + int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); + PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); + + auto* in1_data = in1_value.data(); + auto* input2_data = input2->data(); + + // FIXME(typhoonzero): use macro fix the below messy code. 
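+    // INLINE_FOR2(sizei, sizej) (defined in selected_rows_functor.h) expands
+    // to a nested loop over row index i and element index j, so each case
+    // below applies its scatter op element-wise.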
+ switch (op) { + case ScatterOps::ASSIGN: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] = + in1_data[i * in1_row_numel + j]; + break; + case ScatterOps::ADD: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] += + in1_data[i * in1_row_numel + j]; + break; + case ScatterOps::SUB: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] -= + in1_data[i * in1_row_numel + j]; + break; + case ScatterOps::SUBBY: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] = + in1_data[i * in1_row_numel + j] - + input2_data[in1_rows[i] * in1_row_numel + j]; + break; + case ScatterOps::MUL: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] *= + in1_data[i * in1_row_numel + j]; + break; + case ScatterOps::DIV: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] /= + in1_data[i * in1_row_numel + j]; + break; + case ScatterOps::DIVBY: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] = + in1_data[i * in1_row_numel + j] / + input2_data[in1_rows[i] * in1_row_numel + j]; + break; + } + } +}; + +} // namespace scatter +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu new file mode 100644 index 0000000000000000000000000000000000000000..ee3b5d52058f7f04d5eeda1d033171f9eae0d772 --- /dev/null +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -0,0 +1,385 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { +namespace math { +template +struct SelectedRowsAdd { + void operator()(const platform::CUDADeviceContext& context, + const framework::SelectedRows& input1, + const framework::SelectedRows& input2, + framework::SelectedRows* output) { + auto in1_height = input1.height(); + PADDLE_ENFORCE_EQ(in1_height, input2.height()); + output->set_height(in1_height); + + framework::Vector in1_rows(input1.rows()); + auto& in2_rows = input2.rows(); + std::vector out_rows; + out_rows.reserve(in1_rows.size() + in2_rows.size()); + + // concat rows + out_rows.insert(out_rows.end(), in1_rows.begin(), in1_rows.end()); + out_rows.insert(out_rows.end(), in2_rows.begin(), in2_rows.end()); + output->set_rows(out_rows); + + auto* out_value = output->mutable_value(); + auto& in1_value = input1.value(); + auto& in2_value = input2.value(); + + auto in1_row_numel = in1_value.numel() / in1_rows.size(); + PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size()); + PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size()); + + auto* out_data = out_value->data(); + auto* in1_data = in1_value.data(); + + auto in1_place = input1.place(); + PADDLE_ENFORCE(platform::is_gpu_place(in1_place)); + auto in2_place = input2.place(); + PADDLE_ENFORCE(platform::is_gpu_place(in2_place)); + auto out_place = context.GetPlace(); + PADDLE_ENFORCE(platform::is_gpu_place(out_place)); + + memory::Copy( + boost::get(out_place), out_data, + boost::get(in1_place), in1_data, + in1_value.numel() * sizeof(T), + reinterpret_cast(context).stream()); + + auto* in2_data = in2_value.data(); + memory::Copy(boost::get(out_place), + out_data + in1_value.numel(), + boost::get(in2_place), in2_data, + in2_value.numel() * sizeof(T), context.stream()); + } +}; + +template struct SelectedRowsAdd; +template struct SelectedRowsAdd; + +namespace { +template +__global__ void SelectedRowsAddTensorKernel(const T* selected_rows, + const int64_t* rows, T* tensor_out, + int64_t row_numel) { + const int ty = blockIdx.y; + int tid = threadIdx.x; + + selected_rows += ty * row_numel; + tensor_out += rows[ty] * row_numel; + + for (int index = tid; index < row_numel; index += block_size) { + // Since index in rows of SelectedRows can be duplicate, we can not use + // tensor_out[index] += selected_rows[index]; Instead, we have to use + // AtomicAdd to avoid concurrent write error. 
+ paddle::platform::CudaAtomicAdd(tensor_out + index, selected_rows[index]); + } +} +} // namespace + +template +struct SelectedRowsAddTensor { + void operator()(const platform::CUDADeviceContext& context, + const framework::SelectedRows& input1, + const framework::Tensor& input2, framework::Tensor* output) { + auto in1_height = input1.height(); + auto in2_dims = input2.dims(); + auto out_dims = output->dims(); + PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); + PADDLE_ENFORCE_EQ(in1_height, out_dims[0]); + + auto& in1_value = input1.value(); + framework::Vector in1_rows(input1.rows()); + + int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); + PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height); + PADDLE_ENFORCE_EQ(in1_row_numel, output->numel() / in1_height); + + auto* in1_data = in1_value.data(); + auto* in2_data = input2.data(); + auto* out_data = output->data(); + + SetConstant functor; + functor(context, output, 0.0); + + const int block_size = 256; + dim3 threads(block_size, 1); + dim3 grid(1, in1_rows.size()); + SelectedRowsAddTensorKernel< + T, block_size><<>>( + in1_data, in1_rows.CUDAData(context.GetPlace()), out_data, + in1_row_numel); + + auto out_eigen = framework::EigenVector::Flatten(*output); + auto in2_eigen = framework::EigenVector::Flatten(input2); + out_eigen.device(*context.eigen_device()) = out_eigen + in2_eigen; + } +}; + +template struct SelectedRowsAddTensor; +template struct SelectedRowsAddTensor; + +template +struct SelectedRowsAddTo { + void operator()(const platform::CUDADeviceContext& context, + const framework::SelectedRows& input1, + const int64_t input2_offset, + framework::SelectedRows* input2) { + auto in1_height = input1.height(); + PADDLE_ENFORCE_EQ(in1_height, input2->height()); + + framework::Vector in1_rows(input1.rows()); + auto& in2_rows = *(input2->mutable_rows()); + + auto& in1_value = input1.value(); + auto* in2_value = input2->mutable_value(); + + // concat rows + if (in1_rows.size()) { + in2_rows.Extend(in1_rows.begin(), in1_rows.end()); + } + + auto in1_place = input1.place(); + PADDLE_ENFORCE(platform::is_gpu_place(in1_place)); + auto in2_place = input2->place(); + PADDLE_ENFORCE(platform::is_gpu_place(in2_place)); + + auto* in1_data = in1_value.data(); + auto* in2_data = in2_value->data(); + memory::Copy(boost::get(in2_place), + in2_data + input2_offset, + boost::get(in1_place), in1_data, + in1_value.numel() * sizeof(T), context.stream()); + } +}; + +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; + +namespace { +template +__global__ void SelectedRowsAddToTensorKernel(const T* selected_rows, + const int64_t* rows, + T* tensor_out, + int64_t row_numel) { + const int ty = blockIdx.y; + int tid = threadIdx.x; + + selected_rows += ty * row_numel; + tensor_out += rows[ty] * row_numel; + + for (int index = tid; index < row_numel; index += block_size) { + // Since index in rows of SelectedRows can be duplicate, we have to use + // Atomic Operation to avoid concurrent write error. 
+ paddle::platform::CudaAtomicAdd(tensor_out + index, selected_rows[index]); + } +} +} // namespace + +template +struct SelectedRowsAddToTensor { + void operator()(const platform::CUDADeviceContext& context, + const framework::SelectedRows& input1, + framework::Tensor* input2) { + auto in1_height = input1.height(); + auto in2_dims = input2->dims(); + PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); + + auto& in1_value = input1.value(); + framework::Vector in1_rows(input1.rows()); + + int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); + PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); + + auto* in1_data = in1_value.data(); + auto* in2_data = input2->data(); + const int block_size = 256; + dim3 threads(block_size, 1); + dim3 grid(1, in1_rows.size()); + SelectedRowsAddToTensorKernel< + T, block_size><<>>( + in1_data, in1_rows.CUDAData(context.GetPlace()), in2_data, + in1_row_numel); + } +}; + +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; + +namespace scatter { + +template +__global__ void MergeAddKernel(const T* input, const int64_t* input_rows, + T* out, const int64_t* out_rows, + size_t out_rows_size, int64_t row_numel) { + const int ty = blockIdx.y; + int tid = threadIdx.x; + __shared__ size_t out_idx; + + if (tid == 0) { + for (size_t i = 0; i < out_rows_size; i++) { + if (input_rows[ty] == out_rows[i]) { + out_idx = i; + } + } + } + + __syncthreads(); + + input += ty * row_numel; + out += out_idx * row_numel; + for (int index = tid; index < row_numel; index += block_size) { + paddle::platform::CudaAtomicAdd(out + index, input[index]); + } +} + +template +struct MergeAdd { + framework::SelectedRows operator()(const platform::CUDADeviceContext& context, + const framework::SelectedRows& input) { + framework::SelectedRows out; + framework::Vector input_rows(input.rows()); + std::set row_set(input_rows.begin(), input_rows.end()); + std::vector merge_rows(row_set.begin(), row_set.end()); + + auto input_width = input.value().dims()[1]; + + out.set_rows(merge_rows); + out.set_height(input.height()); + out.mutable_value()->mutable_data( + framework::make_ddim( + {static_cast(merge_rows.size()), input_width}), + context.GetPlace()); + + math::SetConstant constant_functor; + constant_functor(context, out.mutable_value(), 0.0); + + auto* out_data = out.mutable_value()->data(); + auto* input_data = input.value().data(); + + const int block_size = 256; + dim3 threads(block_size, 1); + dim3 grid1(1, input_rows.size()); + + MergeAddKernel< + T, 256><<(context) + .stream()>>>( + input_data, input_rows.CUDAData(context.GetPlace()), out_data, + out.mutable_rows()->CUDAMutableData(context.GetPlace()), + out.rows().size(), input_width); + return out; + } +}; + +template struct MergeAdd; +template struct MergeAdd; +template struct MergeAdd; +template struct MergeAdd; + +template +__global__ void UpdateToTensorKernel(const T* selected_rows, + const int64_t* rows, const ScatterOps& op, + T* tensor_out, int64_t row_numel) { + const int ty = blockIdx.y; + int tid = threadIdx.x; + + selected_rows += ty * row_numel; + tensor_out += rows[ty] * row_numel; + // FIXME(typhoonzero): use macro fix the below messy code. 
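+  // One CUDA block handles one selected row (blockIdx.y); its threads stride
+  // over the row's elements in steps of block_size.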
+ switch (op) { + case ScatterOps::ASSIGN: + for (int index = tid; index < row_numel; index += block_size) { + tensor_out[index] = selected_rows[index]; + } + break; + case ScatterOps::ADD: + for (int index = tid; index < row_numel; index += block_size) { + tensor_out[index] += selected_rows[index]; + } + break; + case ScatterOps::SUB: + for (int index = tid; index < row_numel; index += block_size) { + tensor_out[index] -= selected_rows[index]; + } + break; + case ScatterOps::SUBBY: + for (int index = tid; index < row_numel; index += block_size) { + tensor_out[index] = selected_rows[index] - tensor_out[index]; + } + break; + case ScatterOps::MUL: + for (int index = tid; index < row_numel; index += block_size) { + tensor_out[index] *= selected_rows[index]; + } + break; + case ScatterOps::DIV: + for (int index = tid; index < row_numel; index += block_size) { + tensor_out[index] /= selected_rows[index]; + } + break; + case ScatterOps::DIVBY: + for (int index = tid; index < row_numel; index += block_size) { + tensor_out[index] = selected_rows[index] / tensor_out[index]; + } + break; + } +} + +template +struct UpdateToTensor { + void operator()(const platform::CUDADeviceContext& context, + const ScatterOps& op, const framework::SelectedRows& input1, + framework::Tensor* input2) { + // NOTE: Use SelectedRowsAddToTensor for better performance + // no additional MergeAdd called. + MergeAdd merge_func; + auto merged_in1 = merge_func(context, input1); + + auto in1_height = merged_in1.height(); + auto in2_dims = input2->dims(); + PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); + + auto& in1_value = merged_in1.value(); + auto& in1_rows = merged_in1.rows(); + + int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); + PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); + + auto* in1_data = in1_value.template data(); + auto* in2_data = input2->data(); + + dim3 threads(platform::PADDLE_CUDA_NUM_THREADS, 1); + dim3 grid(1, in1_rows.size()); + UpdateToTensorKernel<<< + grid, threads, 0, context.stream()>>>(in1_data, in1_rows.cuda_data(), + op, in2_data, in1_row_numel); + } +}; +} // namespace scatter +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/selected_rows_functor.h b/paddle/fluid/operators/math/selected_rows_functor.h new file mode 100644 index 0000000000000000000000000000000000000000..510a9ed8be6336448971de305279f607282f7658 --- /dev/null +++ b/paddle/fluid/operators/math/selected_rows_functor.h @@ -0,0 +1,134 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/platform/device_context.h" + +#define INLINE_FOR2(sizei, sizej) \ + for (int64_t i = 0; i < sizei; i++) \ + for (int64_t j = 0; j < sizej; j++) + +namespace paddle { +namespace operators { +namespace math { + +// SelectedRows + SelectedRows will simplely concat value and rows. 
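+// e.g. rows {0, 4, 7} + rows {0, 5, 7, 9} -> rows {0, 4, 7, 0, 5, 7, 9};
+// duplicated row ids are kept rather than merged.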
+// The real computation happens in dealing with LoDTensor. +template +struct SelectedRowsAdd { + void operator()(const DeviceContext& context, + const framework::SelectedRows& input1, + const framework::SelectedRows& input2, + framework::SelectedRows* output); +}; + +template +struct SelectedRowsAddTensor { + void operator()(const DeviceContext& context, + const framework::SelectedRows& input1, + const framework::Tensor& input2, framework::Tensor* output); +}; + +// input2 = input1 + input2 +template +struct SelectedRowsAddTo { + void operator()(const DeviceContext& context, + const framework::SelectedRows& input1, + const int64_t input2_offset, framework::SelectedRows* input2); +}; + +// input2 = input1 + input2 +template +struct SelectedRowsAddToTensor { + void operator()(const DeviceContext& context, + const framework::SelectedRows& input1, + framework::Tensor* input2); +}; + +namespace scatter { +// functors for manuplating SelectedRows data +template +struct MergeAdd { + // unary functor, merge by adding duplicated rows in + // the input SelectedRows object. + framework::SelectedRows operator()(const DeviceContext& context, + const framework::SelectedRows& input); +}; + +template +struct Add { + framework::SelectedRows operator()(const DeviceContext& context, + const framework::SelectedRows& input1, + const framework::SelectedRows& input2) { + framework::SelectedRows out; + out.set_rows(input1.rows()); + out.set_height(input1.height()); + out.mutable_value()->mutable_data(input1.value().dims(), + context.GetPlace()); + auto e_out = framework::EigenVector::Flatten(*(out.mutable_value())); + auto e_in1 = framework::EigenVector::Flatten(input1.value()); + auto e_in2 = framework::EigenVector::Flatten(input2.value()); + e_out.device(*context.eigen_device()) = e_in1 + e_in2; + return out; + } +}; + +template +struct Mul { + // multiply two SelectedRows + framework::SelectedRows operator()(const DeviceContext& context, + const framework::SelectedRows& input1, + const framework::SelectedRows& input2) { + framework::SelectedRows out; + out.set_rows(input1.rows()); + out.set_height(input1.height()); + out.mutable_value()->mutable_data(input1.value().dims(), + context.GetPlace()); + auto e_out = framework::EigenVector::Flatten(*(out.mutable_value())); + auto e_in1 = framework::EigenVector::Flatten(input1.value()); + auto e_in2 = framework::EigenVector::Flatten(input2.value()); + e_out.device(*context.eigen_device()) = e_in1 * e_in2; + return out; + } + // multiply scalar to SelectedRows + framework::SelectedRows operator()(const DeviceContext& context, + const framework::SelectedRows& input1, + const T input2) { + framework::SelectedRows out; + out.set_rows(input1.rows()); + out.set_height(input1.height()); + out.mutable_value()->mutable_data(input1.value().dims(), + context.GetPlace()); + auto e_out = framework::EigenVector::Flatten(*(out.mutable_value())); + auto e_in1 = framework::EigenVector::Flatten(input1.value()); + e_out.device(*context.eigen_device()) = input2 * e_in1; + return out; + } +}; + +enum class ScatterOps { ASSIGN, ADD, SUB, SUBBY, MUL, DIV, DIVBY }; + +// out = seleted_rows_in / tensor +template +struct UpdateToTensor { + void operator()(const DeviceContext& context, const ScatterOps& op, + const framework::SelectedRows& input1, + framework::Tensor* input2); +}; + +} // namespace scatter +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cc 
b/paddle/fluid/operators/math/selected_rows_functor_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..db6b41cd52049a32239eca1a39cd730c11ddc2d8 --- /dev/null +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cc @@ -0,0 +1,194 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "gtest/gtest.h" +#include "paddle/fluid/operators/math/math_function.h" + +TEST(selected_rows_functor, cpu_add) { + using namespace paddle::framework; + using namespace paddle::platform; + using namespace paddle::operators::math; + + CPUPlace cpu_place; + CPUDeviceContext ctx(cpu_place); + SetConstant functor; + int64_t height = 10; + int64_t row_numel = 10; + + std::vector rows1{0, 4, 7}; + std::unique_ptr selected_rows1{new SelectedRows(rows1, height)}; + auto* in1_value = selected_rows1->mutable_value(); + in1_value->mutable_data( + make_ddim({static_cast(rows1.size()), row_numel}), cpu_place); + functor(ctx, in1_value, 1.0); + + std::vector rows2{0, 5, 7, 9}; + std::unique_ptr selected_rows2{new SelectedRows(rows2, height)}; + auto* in2_value = selected_rows2->mutable_value(); + in2_value->mutable_data( + make_ddim({static_cast(rows2.size()), row_numel}), cpu_place); + functor(ctx, in2_value, 2.0); + + std::unique_ptr output{new SelectedRows()}; + auto* out_value = output->mutable_value(); + + // simplely concat two SelectedRows + out_value->mutable_data(make_ddim({7, 10}), cpu_place); + + SelectedRowsAdd add_functor; + add_functor(ctx, *selected_rows1, *selected_rows2, output.get()); + + auto out_height = output->height(); + EXPECT_EQ(out_height, height); + + auto& out_rows = output->rows(); + + // input1 rows + EXPECT_EQ(out_rows[0], 0); + EXPECT_EQ(out_rows[1], 4); + EXPECT_EQ(out_rows[2], 7); + // input2 rows + EXPECT_EQ(out_rows[3], 0); + EXPECT_EQ(out_rows[4], 5); + EXPECT_EQ(out_rows[5], 7); + EXPECT_EQ(out_rows[6], 9); + + auto* out_data = output->value().data(); + // input1 value + EXPECT_EQ(out_data[0 * row_numel + 0], 1.0); + EXPECT_EQ(out_data[0 * row_numel + 8], 1.0); + EXPECT_EQ(out_data[1 * row_numel + 1], 1.0); + EXPECT_EQ(out_data[2 * row_numel + 6], 1.0); + // input2 value + EXPECT_EQ(out_data[3 * row_numel + 3], 2.0); + EXPECT_EQ(out_data[3 * row_numel + 8], 2.0); + EXPECT_EQ(out_data[4 * row_numel + 4], 2.0); + EXPECT_EQ(out_data[5 * row_numel + 7], 2.0); + EXPECT_EQ(out_data[6 * row_numel + 9], 2.0); + + std::unique_ptr tensor1{new Tensor()}; + tensor1->mutable_data(make_ddim({height, row_numel}), cpu_place); + functor(ctx, tensor1.get(), 3.0); + + std::unique_ptr tensor2{new Tensor()}; + tensor2->mutable_data(make_ddim({height, row_numel}), cpu_place); + + SelectedRowsAddTensor add_tensor_functor; + add_tensor_functor(ctx, *output, *tensor1, tensor2.get()); + + auto* tensor2_data = tensor2->data(); + // row0: 1.0 + 2.0 + 3.0 + EXPECT_EQ(tensor2_data[0 * row_numel + 0], 6.0); + // row1: 3.0 + EXPECT_EQ(tensor2_data[1 * row_numel + 1], 
3.0); + // row4 : 1.0 + 3.0 + EXPECT_EQ(tensor2_data[4 * row_numel + 6], 4.0); + // row5: 2.0 + 3.0 + EXPECT_EQ(tensor2_data[5 * row_numel + 7], 5.0); + // row6: 3.0 + EXPECT_EQ(tensor2_data[6 * row_numel + 1], 3.0); + // row7: 1.0 + 2.0 + 3.0 + EXPECT_EQ(tensor2_data[7 * row_numel + 3], 6.0); + // row9: 2.0 + 3.0 + EXPECT_EQ(tensor2_data[9 * row_numel + 6], 5.0); +} + +TEST(selected_rows_functor, cpu_add_to) { + using namespace paddle::framework; + using namespace paddle::platform; + using namespace paddle::operators::math; + + CPUPlace cpu_place; + CPUDeviceContext ctx(cpu_place); + SetConstant functor; + int64_t height = 10; + int64_t row_numel = 10; + + std::vector rows1{0, 4, 7}; + std::unique_ptr selected_rows1{new SelectedRows(rows1, height)}; + auto* in1_value = selected_rows1->mutable_value(); + in1_value->mutable_data( + make_ddim({static_cast(rows1.size()), row_numel}), cpu_place); + functor(ctx, in1_value, 1.0); + + std::vector rows2{0, 5, 7, 9}; + std::unique_ptr selected_rows2{new SelectedRows(rows2, height)}; + auto* in2_value = selected_rows2->mutable_value(); + in2_value->mutable_data( + make_ddim({static_cast(rows2.size()), row_numel}), cpu_place); + functor(ctx, in2_value, 2.0); + + std::unique_ptr output{new SelectedRows()}; + output->set_height(height); + auto* out_value = output->mutable_value(); + + // simplely concat two SelectedRows + out_value->mutable_data(make_ddim({7, 10}), cpu_place); + + SelectedRowsAddTo add_to_functor; + add_to_functor(ctx, *selected_rows1, 0, output.get()); + add_to_functor(ctx, *selected_rows2, in1_value->numel(), output.get()); + + auto out_height = output->height(); + EXPECT_EQ(out_height, height); + + auto& out_rows = output->rows(); + + // input1 rows + EXPECT_EQ(out_rows[0], 0); + EXPECT_EQ(out_rows[1], 4); + EXPECT_EQ(out_rows[2], 7); + // input2 rows + EXPECT_EQ(out_rows[3], 0); + EXPECT_EQ(out_rows[4], 5); + EXPECT_EQ(out_rows[5], 7); + EXPECT_EQ(out_rows[6], 9); + + auto* out_data = output->value().data(); + // input1 value + EXPECT_EQ(out_data[0 * row_numel + 0], 1.0); + EXPECT_EQ(out_data[0 * row_numel + 8], 1.0); + EXPECT_EQ(out_data[1 * row_numel + 1], 1.0); + EXPECT_EQ(out_data[2 * row_numel + 6], 1.0); + // input2 value + EXPECT_EQ(out_data[3 * row_numel + 3], 2.0); + EXPECT_EQ(out_data[3 * row_numel + 8], 2.0); + EXPECT_EQ(out_data[4 * row_numel + 4], 2.0); + EXPECT_EQ(out_data[5 * row_numel + 7], 2.0); + EXPECT_EQ(out_data[6 * row_numel + 9], 2.0); + + std::unique_ptr tensor1{new Tensor()}; + tensor1->mutable_data(make_ddim({height, row_numel}), cpu_place); + functor(ctx, tensor1.get(), 3.0); + + SelectedRowsAddToTensor add_to_tensor_functor; + add_to_tensor_functor(ctx, *output, tensor1.get()); + + auto* tensor1_data = tensor1->data(); + // row0: 1.0 + 2.0 + 3.0 + EXPECT_EQ(tensor1_data[0 * row_numel + 0], 6.0); + // row1: 3.0 + EXPECT_EQ(tensor1_data[1 * row_numel + 1], 3.0); + // row4 : 1.0 + 3.0 + EXPECT_EQ(tensor1_data[4 * row_numel + 6], 4.0); + // row5: 2.0 + 3.0 + EXPECT_EQ(tensor1_data[5 * row_numel + 7], 5.0); + // row6: 3.0 + EXPECT_EQ(tensor1_data[6 * row_numel + 1], 3.0); + // row7: 1.0 + 2.0 + 3.0 + EXPECT_EQ(tensor1_data[7 * row_numel + 3], 6.0); + // row9: 2.0 + 3.0 + EXPECT_EQ(tensor1_data[9 * row_numel + 6], 5.0); +} diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cu b/paddle/fluid/operators/math/selected_rows_functor_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..b3c4bc9244f9ca1771c5f435788cf3789d7c4574 --- /dev/null +++ 
b/paddle/fluid/operators/math/selected_rows_functor_test.cu @@ -0,0 +1,212 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "gtest/gtest.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" + +TEST(selected_rows_functor, gpu_add) { + using namespace paddle::framework; + using namespace paddle::platform; + using namespace paddle::operators::math; + + CUDAPlace gpu_place(0); + CPUPlace cpu_place; + CUDADeviceContext ctx(gpu_place); + SetConstant functor; + int64_t height = 10; + int64_t row_numel = 10; + + std::vector rows1{0, 4, 7}; + std::unique_ptr selected_rows1{new SelectedRows(rows1, height)}; + auto* in1_value = selected_rows1->mutable_value(); + in1_value->mutable_data( + make_ddim({static_cast(rows1.size()), row_numel}), gpu_place); + functor(ctx, in1_value, 1.0); + + std::vector rows2{0, 5, 7, 9}; + std::unique_ptr selected_rows2{new SelectedRows(rows2, height)}; + auto* in2_value = selected_rows2->mutable_value(); + in2_value->mutable_data( + make_ddim({static_cast(rows2.size()), row_numel}), gpu_place); + functor(ctx, in2_value, 2.0); + + std::unique_ptr output{new SelectedRows()}; + auto* out_value = output->mutable_value(); + + // simplely concat two SelectedRows + out_value->mutable_data(make_ddim({7, 10}), gpu_place); + + SelectedRowsAdd add_functor; + add_functor(ctx, *selected_rows1, *selected_rows2, output.get()); + + auto out_height = output->height(); + EXPECT_EQ(out_height, height); + + auto& out_rows = output->rows(); + + // input1 rows + EXPECT_EQ(out_rows[0], 0); + EXPECT_EQ(out_rows[1], 4); + EXPECT_EQ(out_rows[2], 7); + // input2 rows + EXPECT_EQ(out_rows[3], 0); + EXPECT_EQ(out_rows[4], 5); + EXPECT_EQ(out_rows[5], 7); + EXPECT_EQ(out_rows[6], 9); + + Tensor out_cpu; + Copy(*out_value, cpu_place, ctx, &out_cpu); + ctx.Wait(); + + auto* out_cpu_data = out_cpu.data(); + // input1 value + EXPECT_EQ(out_cpu_data[0 * row_numel + 0], 1.0); + EXPECT_EQ(out_cpu_data[0 * row_numel + 8], 1.0); + EXPECT_EQ(out_cpu_data[1 * row_numel + 1], 1.0); + EXPECT_EQ(out_cpu_data[2 * row_numel + 6], 1.0); + // input2 value + EXPECT_EQ(out_cpu_data[3 * row_numel + 3], 2.0); + EXPECT_EQ(out_cpu_data[3 * row_numel + 8], 2.0); + EXPECT_EQ(out_cpu_data[4 * row_numel + 4], 2.0); + EXPECT_EQ(out_cpu_data[5 * row_numel + 7], 2.0); + EXPECT_EQ(out_cpu_data[6 * row_numel + 9], 2.0); + + std::unique_ptr tensor1{new Tensor()}; + tensor1->mutable_data(make_ddim({height, row_numel}), gpu_place); + functor(ctx, tensor1.get(), 3.0); + + std::unique_ptr tensor2{new Tensor()}; + tensor2->mutable_data(make_ddim({height, row_numel}), gpu_place); + + SelectedRowsAddTensor add_tensor_functor; + add_tensor_functor(ctx, *output, *tensor1, tensor2.get()); + + Tensor tensor2_cpu; + Copy(*tensor2, cpu_place, ctx, &tensor2_cpu); + ctx.Wait(); + + auto* tensor2_cpu_data = tensor2_cpu.data(); + // row0: 1.0 + 2.0 + 3.0 + EXPECT_EQ(tensor2_cpu_data[0 * row_numel + 0], 6.0); + // row1: 3.0 + 
EXPECT_EQ(tensor2_cpu_data[1 * row_numel + 1], 3.0); + // row4 : 1.0 + 3.0 + EXPECT_EQ(tensor2_cpu_data[4 * row_numel + 6], 4.0); + // row5: 2.0 + 3.0 + EXPECT_EQ(tensor2_cpu_data[5 * row_numel + 7], 5.0); + // row6: 3.0 + EXPECT_EQ(tensor2_cpu_data[6 * row_numel + 1], 3.0); + // row7: 1.0 + 2.0 + 3.0 + EXPECT_EQ(tensor2_cpu_data[7 * row_numel + 3], 6.0); + // row9: 2.0 + 3.0 + EXPECT_EQ(tensor2_cpu_data[9 * row_numel + 6], 5.0); +} + +TEST(selected_rows_functor, gpu_add_to) { + using namespace paddle::framework; + using namespace paddle::platform; + using namespace paddle::operators::math; + + CUDAPlace gpu_place(0); + CPUPlace cpu_place; + CUDADeviceContext ctx(gpu_place); + SetConstant functor; + int64_t height = 10; + int64_t row_numel = 10; + + std::vector rows1{0, 4, 7}; + std::unique_ptr selected_rows1{new SelectedRows(rows1, height)}; + auto* in1_value = selected_rows1->mutable_value(); + in1_value->mutable_data( + make_ddim({static_cast(rows1.size()), row_numel}), gpu_place); + functor(ctx, in1_value, 1.0); + + std::vector rows2{0, 5, 7, 9}; + std::unique_ptr selected_rows2{new SelectedRows(rows2, height)}; + auto* in2_value = selected_rows2->mutable_value(); + in2_value->mutable_data( + make_ddim({static_cast(rows2.size()), row_numel}), gpu_place); + functor(ctx, in2_value, 2.0); + + std::unique_ptr output{new SelectedRows()}; + output->set_height(height); + auto* out_value = output->mutable_value(); + + // simplely concat two SelectedRows + out_value->mutable_data(make_ddim({7, 10}), gpu_place); + + SelectedRowsAddTo add_to_functor; + add_to_functor(ctx, *selected_rows1, 0, output.get()); + add_to_functor(ctx, *selected_rows2, in1_value->numel(), output.get()); + + auto out_height = output->height(); + EXPECT_EQ(out_height, height); + + auto& out_rows = output->rows(); + + // input1 rows + EXPECT_EQ(out_rows[0], 0); + EXPECT_EQ(out_rows[1], 4); + EXPECT_EQ(out_rows[2], 7); + // input2 rows + EXPECT_EQ(out_rows[3], 0); + EXPECT_EQ(out_rows[4], 5); + EXPECT_EQ(out_rows[5], 7); + EXPECT_EQ(out_rows[6], 9); + + Tensor out_cpu; + Copy(*out_value, cpu_place, ctx, &out_cpu); + ctx.Wait(); + + auto* out_cpu_data = out_cpu.data(); + // input1 value + EXPECT_EQ(out_cpu_data[0 * row_numel + 0], 1.0); + EXPECT_EQ(out_cpu_data[0 * row_numel + 8], 1.0); + EXPECT_EQ(out_cpu_data[1 * row_numel + 1], 1.0); + EXPECT_EQ(out_cpu_data[2 * row_numel + 6], 1.0); + // input2 value + EXPECT_EQ(out_cpu_data[3 * row_numel + 3], 2.0); + EXPECT_EQ(out_cpu_data[3 * row_numel + 8], 2.0); + EXPECT_EQ(out_cpu_data[4 * row_numel + 4], 2.0); + EXPECT_EQ(out_cpu_data[5 * row_numel + 7], 2.0); + EXPECT_EQ(out_cpu_data[6 * row_numel + 9], 2.0); + + std::unique_ptr tensor1{new Tensor()}; + tensor1->mutable_data(make_ddim({height, row_numel}), gpu_place); + functor(ctx, tensor1.get(), 3.0); + + SelectedRowsAddToTensor add_to_tensor_functor; + add_to_tensor_functor(ctx, *output, tensor1.get()); + + Tensor tensor1_cpu; + Copy(*tensor1, cpu_place, ctx, &tensor1_cpu); + ctx.Wait(); + + auto* tensor1_cpu_data = tensor1_cpu.data(); + // row0: 1.0 + 2.0 + 3.0 + EXPECT_EQ(tensor1_cpu_data[0 * row_numel + 0], 6.0); + // row1: 3.0 + EXPECT_EQ(tensor1_cpu_data[1 * row_numel + 1], 3.0); + // row4 : 1.0 + 3.0 + EXPECT_EQ(tensor1_cpu_data[4 * row_numel + 6], 4.0); + // row5: 2.0 + 3.0 + EXPECT_EQ(tensor1_cpu_data[5 * row_numel + 7], 5.0); + // row6: 3.0 + EXPECT_EQ(tensor1_cpu_data[6 * row_numel + 1], 3.0); + // row7: 1.0 + 2.0 + 3.0 + EXPECT_EQ(tensor1_cpu_data[7 * row_numel + 3], 6.0); + // row9: 2.0 + 3.0 + 
EXPECT_EQ(tensor1_cpu_data[9 * row_numel + 6], 5.0); +} diff --git a/paddle/fluid/operators/math/sequence2batch.cc b/paddle/fluid/operators/math/sequence2batch.cc new file mode 100644 index 0000000000000000000000000000000000000000..0485070fd9b722bdf9011452b2545e065f46d2ac --- /dev/null +++ b/paddle/fluid/operators/math/sequence2batch.cc @@ -0,0 +1,64 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/sequence2batch.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { +namespace math { + +template +class CopyMatrixRowsFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& src, + framework::Vector index_lod, framework::Tensor& dst, + bool is_src_index) { + size_t* index = index_lod.data(); + auto src_dims = src.dims(); + auto dst_dims = dst.dims(); + PADDLE_ENFORCE_EQ(src_dims.size(), 2UL, + "The src must be matrix with rank 2."); + PADDLE_ENFORCE_EQ(dst_dims.size(), 2UL, + "The dst must be matrix with rank 2."); + PADDLE_ENFORCE_EQ(src_dims[1], dst_dims[1], + "The width of src and dst must be same."); + auto height = dst_dims[0]; + auto width = dst_dims[1]; + auto* src_data = src.data(); + auto* dst_data = dst.data(); + for (int i = 0; i < height; ++i) { + if (is_src_index) { + memcpy(dst_data + i * width, src_data + index[i] * width, + width * sizeof(T)); + } else { + memcpy(dst_data + index[i] * width, src_data + i * width, + width * sizeof(T)); + } + } + } +}; + +template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; + +template class LoDTensor2BatchFunctor; +template class LoDTensor2BatchFunctor; +template class Batch2LoDTensorFunctor; +template class Batch2LoDTensorFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/sequence2batch.cu b/paddle/fluid/operators/math/sequence2batch.cu new file mode 100644 index 0000000000000000000000000000000000000000..450be80ea2fe67aa0e537f06e15f07b38c5751ea --- /dev/null +++ b/paddle/fluid/operators/math/sequence2batch.cu @@ -0,0 +1,80 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/math/sequence2batch.h" + +namespace paddle { +namespace operators { +namespace math { + +template +__global__ void CopyMatrixRowsKernel(const T* src, T* dst, const size_t* index, + int64_t height, int64_t width, + bool is_src_index) { + int idx = threadIdx.x; + int idy = threadIdx.y; + int id = blockIdx.x + idy * GridDimX; + while (id < height) { + int src_idx = is_src_index ? index[id] : id; + int dst_idx = is_src_index ? id : index[id]; + const T* src_data = src + src_idx * width; + T* dst_data = dst + dst_idx * width; + for (int i = idx; i < width; i += BlockDimX) { + dst_data[i] = src_data[i]; + } + id += BlockDimY * GridDimX; + } +} + +template +class CopyMatrixRowsFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& src, + framework::Vector index_lod, framework::Tensor& dst, + bool is_src_index) { + auto src_dims = src.dims(); + auto dst_dims = dst.dims(); + PADDLE_ENFORCE_EQ(src_dims.size(), 2, + "The src must be matrix with rank 2."); + PADDLE_ENFORCE_EQ(dst_dims.size(), 2, + "The dst must be matrix with rank 2."); + PADDLE_ENFORCE_EQ(src_dims[1], dst_dims[1], + "The width of src and dst must be same."); + auto height = dst_dims[0]; + auto width = dst_dims[1]; + auto* src_data = src.data(); + auto* dst_data = dst.data(); + + dim3 threads(128, 8); + dim3 grid(8, 1); + auto stream = context.stream(); + CopyMatrixRowsKernel<<>>( + src_data, dst_data, index_lod.CUDAData(context.GetPlace()), height, + width, is_src_index); + } +}; + +template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; + +template class LoDTensor2BatchFunctor; +template class LoDTensor2BatchFunctor; +template class Batch2LoDTensorFunctor; +template class Batch2LoDTensorFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/sequence2batch.h b/paddle/fluid/operators/math/sequence2batch.h new file mode 100644 index 0000000000000000000000000000000000000000..00bd25ab613b198e539368f3233d71618dfc758f --- /dev/null +++ b/paddle/fluid/operators/math/sequence2batch.h @@ -0,0 +1,168 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace math { + +template +using EigenMatrix = framework::EigenMatrix; + +template +class CopyMatrixRowsFunctor { + public: + // If is_src_index is true, + // copy the indexed rows of input src to the output dst. + // If is_src_index is false, + // copy the input src to the indexed rows of output dst. + // The indexed rows are based on the input index. 
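+  // Illustrative example (not part of the original comment): with
+  // index_lod = {2, 0, 1} and is_src_index == true, dst row 0 is copied from
+  // src row 2, dst row 1 from src row 0 and dst row 2 from src row 1; with
+  // is_src_index == false the copy direction is reversed, i.e. src row i is
+  // written to dst row index_lod[i].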
+ void operator()(const DeviceContext& context, const framework::Tensor& src, + framework::Vector index_lod, framework::Tensor& dst, + bool is_src_index); +}; + +template +class LoDTensor2BatchFunctor { + // Calculate the length of each sequence and + // sort sequence index by the length. + // example: sequences = {s0, s1, s2} + // s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2 + // seq_info[3] = {(4, 5, 1), (0, 4, 0), (9, 3, 2)} + // + struct SeqInfo { + SeqInfo(int start, int length, int seq_idx) + : start(start), length(length), seq_idx(seq_idx) {} + int start; + int length; + int seq_idx; + }; + + public: + void operator()(const DeviceContext& context, + const framework::LoDTensor& lod_tensor, + framework::LoDTensor& batch, bool is_cal_batch_lod, + bool is_reverse = false) const { + if (!is_cal_batch_lod) { + auto lods = batch.lod(); + PADDLE_ENFORCE_GT(lods.size(), 2UL); + PADDLE_ENFORCE_EQ(lods[1].size(), + static_cast(lod_tensor.dims()[0])); + CopyMatrixRowsFunctor to_batch; + to_batch(context, lod_tensor, lods[1], batch, true); + return; + } + + auto lods = lod_tensor.lod(); + auto lod = lods[0]; + PADDLE_ENFORCE_EQ(lods.size(), 1UL, "Only support one level sequence now."); + + std::vector seq_info; + for (size_t seq_id = 0; seq_id < lod.size() - 1; ++seq_id) { + int length = lod[seq_id + 1] - lod[seq_id]; + seq_info.emplace_back(lod[seq_id], length, seq_id); + } + + std::sort(seq_info.begin(), seq_info.end(), + [](SeqInfo a, SeqInfo b) { return a.length > b.length; }); + + // Calculate the start position of each batch. + // example: sequences = {s0, s1, s2} + // s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2 + // num_batch = 5, + // batchIndex = {b0, b1, b2, b3, b4} + // b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1 + // batch_start_positions[6] = {0, 3, 6, 9, 11, 12} + // batch_start_positions[0] = len(b0) + // batch_start_positions[1] = len(b0) + len(b1) + // batch_start_positions[2] = len(b0) + len(b1) + len(b2) + // ... + // seq2batch_idx[12] = {4, 0, 9, + // 5, 1, 10, + // 6, 2, 11, + // 7, 3, + // 8} + // seq_order = {1, 0, 2}, the sort order. + // where 1 is the second sequence, + // 0 is the first sequence, + // 2 is the third sequence. + // The num_batch represents batch size after rearranging the + // input LodTensor. It is also the maximum length of input sequence. + + paddle::framework::LoD batch_lods; + batch_lods.emplace_back(std::vector{0}); + batch_lods.emplace_back(std::vector{0}); + batch_lods.emplace_back(std::vector{0}); + + // batch_lods[0] is the start positions for batch LoDTensor + int num_batch = seq_info[0].length; + batch_lods[0].resize(static_cast(num_batch + 1)); + // batch_lods[1] is the raw index in the input LoDTensor + batch_lods[1].resize(static_cast(lod_tensor.dims()[0])); + // batch_lods[2] is the sort order for the input LoDTensor. + batch_lods[2].resize(seq_info.size()); + + size_t* batch_starts = batch_lods[0].data(); + size_t* seq2batch_idx = batch_lods[1].data(); + batch_starts[0] = 0; + for (int n = 0; n < num_batch; n++) { + auto batch_id = static_cast(batch_starts[n]); + for (size_t i = 0; i < seq_info.size(); ++i) { + int seq_len = seq_info[i].length; + int start = seq_info[i].start; + if (n < seq_len) { + seq2batch_idx[batch_id] = + is_reverse ? 
start + seq_len - 1 - n : start + n; + batch_id++; + } else { + break; + } + } + batch_starts[n + 1] = static_cast(batch_id); + } + size_t* seq_order = batch_lods[2].data(); + for (size_t i = 0; i < seq_info.size(); ++i) { + seq_order[i] = seq_info[i].seq_idx; + } + batch.set_lod(batch_lods); + + CopyMatrixRowsFunctor to_batch; + to_batch(context, lod_tensor, batch_lods[1], batch, true); + } +}; + +template +class Batch2LoDTensorFunctor { + public: + void operator()(const DeviceContext& context, + const framework::LoDTensor& batch, + framework::LoDTensor& lod_tensor) const { + auto in_lod = batch.lod(); + PADDLE_ENFORCE_GT(in_lod.size(), 2UL); + PADDLE_ENFORCE_EQ(in_lod[1].size(), + static_cast(lod_tensor.dims()[0])); + CopyMatrixRowsFunctor to_seq; + to_seq(context, batch, in_lod[1], lod_tensor, false); + } +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/sequence_padding.cc b/paddle/fluid/operators/math/sequence_padding.cc new file mode 100644 index 0000000000000000000000000000000000000000..ad8cd825676c77cc204bbb02f88f422d945f1a2c --- /dev/null +++ b/paddle/fluid/operators/math/sequence_padding.cc @@ -0,0 +1,146 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/sequence_padding.h" + +namespace paddle { +namespace operators { +namespace math { + +template +class PaddingLoDTensorFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::LoDTensor& seq, framework::Tensor& padding, + bool norm_by_times) { + auto lod = seq.lod(); + PADDLE_ENFORCE_GT(lod.size(), 0UL, + "The LoD of LoDTensor seq should not be null."); + + const size_t level = 0; + framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); + + auto seq_dims = seq.dims(); + PADDLE_ENFORCE_EQ(seq_dims[0], + static_cast(abs_offset_lod[level].back()), + "The first dimension of LoDTensor seq should be " + "equal to the sum of all sequences's length."); + + auto padding_dims = padding.dims(); + PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL, + "The input padding should be a 3-D Tensor of shape " + "[max_sequence_length, num_sequences, sequence_width]."); + + const int64_t max_sequence_length = MaximumSequenceLength(lod, level); + PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length, + "The first dimension of Tensor padding should be the " + "maximum length of all sequences in LoDTensor seq."); + + const int64_t num_sequences = abs_offset_lod[level].size() - 1; + PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences, + "The second dimension of Tensor padding should be the " + "number of sequences in LoDTensor seq."); + + const int64_t sequence_width = seq.numel() / seq_dims[0]; + PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width, + "The third dimension of Tensor padding should be the " + "width of sequence in LoDTensor seq."); + + const T* seq_data = seq.data(); + T* padding_data = padding.data(); + for (int64_t i = 0; i < max_sequence_length; ++i) { + for (int64_t j = 0; j < num_sequences; ++j) { + int64_t start_pos = abs_offset_lod[level][j]; + int64_t sequence_length = abs_offset_lod[level][j + 1] - start_pos; + if (i < sequence_length) { + // i > 0 => sequence_length > 0 + T scale = + norm_by_times ? 
(1.0f / static_cast(sequence_length)) : 1.0f; + for (int64_t k = 0; k < sequence_width; ++k) { + padding_data[(i * num_sequences + j) * sequence_width + k] = + seq_data[(start_pos + i) * sequence_width + k] * scale; + } + } else { + memset(padding_data + (i * num_sequences + j) * sequence_width, 0, + sequence_width * sizeof(T)); + } + } + } + } +}; + +template +class UnpaddingLoDTensorFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + framework::LoDTensor& seq, const framework::Tensor& padding, + bool norm_by_times) { + auto lod = seq.lod(); + PADDLE_ENFORCE_GT(lod.size(), 0UL, + "The LoD of LoDTensor seq should not be null."); + + const size_t level = 0; + framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); + + auto seq_dims = seq.dims(); + PADDLE_ENFORCE_EQ(seq_dims[0], + static_cast(abs_offset_lod[level].back()), + "The first dimension of LoDTensor seq should be " + "equal to the sum of all sequences's length."); + + auto padding_dims = padding.dims(); + PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL, + "The input padding should be a 3-D Tensor of shape " + "[max_sequnece_length, num_sequences, sequence_width]."); + + const int64_t max_sequence_length = MaximumSequenceLength(lod, level); + PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length, + "The first dimension of Tensor padding should be " + "the maximum length of all sequences in LoDTensor seq."); + + const int64_t num_sequences = abs_offset_lod[level].size() - 1; + PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences, + "The second dimension of Tensor padding should be " + "the number of sequences in LoDTensor seq."); + + const int64_t sequence_width = seq.numel() / seq_dims[0]; + PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width, + "The third dimension of Tensor padding should be the " + "width of sequence in LoDTensor seq."); + + const T* padding_data = padding.data(); + T* seq_data = seq.data(); + for (int64_t i = 0; i < num_sequences; ++i) { + int64_t start_pos = abs_offset_lod[level][i]; + int64_t sequence_length = abs_offset_lod[level][i + 1] - start_pos; + for (int64_t j = 0; j < sequence_length; ++j) { + // sequence_width > j > 0 + T scale = + norm_by_times ? (1.0f / static_cast(sequence_length)) : 1.0f; + for (int64_t k = 0; k < sequence_width; ++k) { + seq_data[(start_pos + j) * sequence_width + k] = + padding_data[(j * num_sequences + i) * sequence_width + k] * + scale; + } + } + } + } +}; + +template class PaddingLoDTensorFunctor; +template class UnpaddingLoDTensorFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/sequence_padding.cu b/paddle/fluid/operators/math/sequence_padding.cu new file mode 100644 index 0000000000000000000000000000000000000000..c1a390577840db2424185da19f5a5d2b231f25b6 --- /dev/null +++ b/paddle/fluid/operators/math/sequence_padding.cu @@ -0,0 +1,215 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/sequence_padding.h" + +namespace paddle { +namespace operators { +namespace math { + +template +__global__ void SequencePaddingKernel(T* padding, T* sequence, + const size_t* sequence_start_positions, + const size_t sequence_width, + const size_t max_sequence_length, + const size_t num_sequences) { + size_t padding_idx = blockIdx.y; + size_t start_pos = sequence_start_positions[padding_idx]; + size_t sequence_length = + sequence_start_positions[padding_idx + 1] - start_pos; + + size_t sequence_idx = blockIdx.x * blockDim.y + threadIdx.y; + size_t padding_base_idx = + (sequence_idx * num_sequences + padding_idx) * sequence_width; + size_t sequence_base_idx = (start_pos + sequence_idx) * sequence_width; + + if (sequence_idx < sequence_length) { + T scale = NormByTimes ? (1.0f / static_cast(sequence_length)) : 1.0f; + if (Padding) { + /* sequence -> padding */ + for (size_t i = threadIdx.x; i < sequence_width; i += blockDim.x) { + padding[padding_base_idx + i] = scale * sequence[sequence_base_idx + i]; + } + } else { + /* padding -> sequence */ + for (size_t i = threadIdx.x; i < sequence_width; i += blockDim.x) { + sequence[sequence_base_idx + i] = scale * padding[padding_base_idx + i]; + } + } + } else if (sequence_idx < max_sequence_length) { + if (Padding) { + /* sequence -> padding */ + for (size_t i = threadIdx.x; i < sequence_width; i += blockDim.x) { + padding[padding_base_idx + i] = 0; + } + } + } +} + +template +class PaddingLoDTensorFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::LoDTensor& seq, framework::Tensor& padding, + bool norm_by_times) { + auto lod = seq.lod(); + PADDLE_ENFORCE_GT(lod.size(), 0UL, + "The lod of LoDTensor seq should not be null."); + + const size_t level = 0; + framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); + + auto seq_dims = seq.dims(); + PADDLE_ENFORCE_EQ(seq_dims[0], + static_cast(abs_offset_lod[level].back()), + "The first dimension of LoDTensor seq should be " + "equal to the sum of all sequences's length."); + + auto padding_dims = padding.dims(); + PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL, + "The input padding should be a 3-D Tensor of shape " + "[max_sequence_length, num_sequences, sequence_width]."); + + int64_t max_sequence_length = MaximumSequenceLength(lod, level); + PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length, + "The first dimension of Tensor padding should be the " + "maximum length of all sequences in LoDTensor seq."); + + const int64_t num_sequences = abs_offset_lod[level].size() - 1; + PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences, + "The second dimension of Tensor padding should be the " + "number of sequences in LoDTensor seq."); + + const int64_t sequence_width = seq.numel() / seq_dims[0]; + PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width, + "The third dimension of Tensor padding should be the " + "width of sequence in LoDTensor seq."); + + if (!norm_by_times && num_sequences == 1UL) { + Copy(seq, context.GetPlace(), context, &padding); + padding.Resize(padding_dims); + return; + } + + const int64_t kBlockSize = 512; + + /* At least use 32 threads to copy sequence_width elements, + * and at least 8 elements for each thread. 
+ */ + size_t block_dim_x = + std::min(((((sequence_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize); + size_t block_dim_y = kBlockSize / block_dim_x; + dim3 threads(block_dim_x, block_dim_y); + + size_t grid_dim_x = (max_sequence_length + block_dim_y - 1) / block_dim_y; + size_t grid_dim_y = num_sequences; + dim3 grid(grid_dim_x, grid_dim_y); + + const T* seq_data = seq.data(); + T* padding_data = padding.data(); + if (norm_by_times) { + SequencePaddingKernel<<>>( + padding_data, const_cast(seq_data), + abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width, + max_sequence_length, num_sequences); + } else { + SequencePaddingKernel<<>>( + padding_data, const_cast(seq_data), + abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width, + max_sequence_length, num_sequences); + } + } +}; + +template +class UnpaddingLoDTensorFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + framework::LoDTensor& seq, const framework::Tensor& padding, + bool norm_by_times) { + auto lod = seq.lod(); + PADDLE_ENFORCE_GT(lod.size(), 0UL, + "The lod of LoDTensor seq should not be null."); + + const size_t level = 0; + framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); + + auto seq_dims = seq.dims(); + PADDLE_ENFORCE_EQ(seq_dims[0], + static_cast(abs_offset_lod[level].back()), + "The first dimension of LoDTensor seq should be " + "equal to the sum of all sequences's length."); + + auto padding_dims = padding.dims(); + PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL, + "The input padding should be a 3-D Tensor of shape " + "[max_sequnece_length, num_sequences, sequence_width]."); + + int64_t max_sequence_length = MaximumSequenceLength(lod, level); + PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length, + "The first dimension of Tensor padding should be " + "the maximum length of all sequences in LoDTensor seq."); + + const int64_t num_sequences = abs_offset_lod[level].size() - 1; + PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences, + "The second dimension of Tensor padding should be " + "the number of sequences in LoDTensor seq."); + + const int64_t sequence_width = seq.numel() / seq_dims[0]; + PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width, + "The third dimension of Tensor padding should be the " + "width of sequence in LoDTensor seq."); + + if (!norm_by_times && num_sequences == 1UL) { + Copy(padding, context.GetPlace(), context, &seq); + seq.Resize(seq_dims); + return; + } + + const int64_t kBlockSize = 512; + + /* At least use 32 threads to copy sequence_width elements, + * and at least 8 elements for each thread. 
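+     * Launch-geometry note (added for clarity): blockIdx.y selects the
+     * sequence, blockIdx.x * blockDim.y + threadIdx.y selects the time step
+     * within the padded length, and threadIdx.x strides across sequence_width
+     * elements, so grid_dim_x = ceil(max_sequence_length / block_dim_y) and
+     * grid_dim_y = num_sequences.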
+ */ + size_t block_dim_x = + std::min(((((sequence_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize); + size_t block_dim_y = kBlockSize / block_dim_x; + dim3 threads(block_dim_x, block_dim_y); + + size_t grid_dim_x = (max_sequence_length + block_dim_y - 1) / block_dim_y; + size_t grid_dim_y = num_sequences; + dim3 grid(grid_dim_x, grid_dim_y); + + const T* padding_data = padding.data(); + T* seq_data = seq.data(); + if (norm_by_times) { + SequencePaddingKernel<<>>( + const_cast(padding_data), seq_data, + abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width, + max_sequence_length, num_sequences); + } else { + SequencePaddingKernel<<>>( + const_cast(padding_data), seq_data, + abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width, + max_sequence_length, num_sequences); + } + } +}; + +template class PaddingLoDTensorFunctor; +template class UnpaddingLoDTensorFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/sequence_padding.h b/paddle/fluid/operators/math/sequence_padding.h new file mode 100644 index 0000000000000000000000000000000000000000..0d84f9dcb3802d82cd385957f66ffe28269b8dfc --- /dev/null +++ b/paddle/fluid/operators/math/sequence_padding.h @@ -0,0 +1,79 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace math { + +inline static size_t MaximumSequenceLength(const framework::LoD& lod, + const size_t level) { + const size_t num_sequences = lod[level].size() - 1; + size_t max_sequence_length = 0; + framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); + for (size_t i = 0; i < num_sequences; ++i) { + max_sequence_length = + std::max(max_sequence_length, + abs_offset_lod[level][i + 1] - abs_offset_lod[level][i]); + } + return max_sequence_length; +} + +/* + * \brief Padding/Unpadding LoDTensor to/from normal Tensor of the shape + * [max_sequence_length, num_sequences, sequence_width]. + * + * Padding sequence: + * padding[i] = seq[lod[level][i]] + * Unpadding sequence: + * seq[lod[level][i]] = padding[i] + * + * All sequences will be padded to the same length and stored in a transposed + * shape. + * Example: + * seq (s0, s0, s0, s0; s1, s1; s2, s2, s2; s3) + * padding (s0, s1, s2, s3; s0, s1, s2, 0; s0, 0, s2, 0; s0, 0, 0, 0) + * + * \param context device context of this functor. + * \param seq LoDTensor which is stored in sequence format, the shape + * is [total_sequence_length, sequence_width] where + * total_sequence_length is the sum of all sequences' + * length. + * \param padding Tensor which is padded to the same length, the shape is + * [max_sequence_length, num_sequences, sequence_width]. + * \param norm_by_times whether dividing sequence's length. + * + * \note transposition is also done in this functor. 
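+ *
+ * A minimal usage sketch (illustrative only; dev_ctx, seq and padding are
+ * assumed to be an initialized CPUDeviceContext, a LoDTensor holding the
+ * sequences, and a pre-sized 3-D Tensor, as in the unit test below):
+ *
+ *   math::PaddingLoDTensorFunctor<platform::CPUDeviceContext, float> pad;
+ *   pad(dev_ctx, seq, padding, false);   // pad without length normalization
+ *   math::UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, float> unpad;
+ *   unpad(dev_ctx, seq, padding, false); // copy padded rows back into seq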
+ */ +template +class PaddingLoDTensorFunctor { + public: + void operator()(const DeviceContext& context, const framework::LoDTensor& seq, + framework::Tensor& padding, bool norm_by_times); +}; + +template +class UnpaddingLoDTensorFunctor { + public: + void operator()(const DeviceContext& context, framework::LoDTensor& seq, + const framework::Tensor& padding, bool norm_by_times); +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/sequence_padding_test.cc b/paddle/fluid/operators/math/sequence_padding_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..147cb37da2bbb1bad6fc423b3936a1446e17de15 --- /dev/null +++ b/paddle/fluid/operators/math/sequence_padding_test.cc @@ -0,0 +1,104 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/sequence_padding.h" +#include + +template +void TestSequencePadding(const paddle::framework::LoD& lod, + const size_t sequence_width) { + paddle::framework::LoDTensor cpu_seq; + paddle::framework::LoDTensor cpu_seq_back; + paddle::framework::LoDTensor seq; + paddle::framework::LoDTensor seq_back; + paddle::framework::Tensor padding; + + const size_t level = lod.size() - 1; + auto seq_dims = + paddle::framework::make_ddim({static_cast(lod[level].back()), + static_cast(sequence_width)}); + + cpu_seq.set_lod(lod); + cpu_seq.mutable_data(seq_dims, paddle::platform::CPUPlace()); + for (int64_t i = 0; i < cpu_seq.numel(); ++i) { + cpu_seq.data()[i] = static_cast(i); + } + + auto* place = new Place(); + DeviceContext* context = new DeviceContext(*place); + if (paddle::platform::is_cpu_place(*place)) { + seq = cpu_seq; + } else { + Copy(cpu_seq, *place, *context, &seq); + seq.set_lod(lod); + } + + const size_t max_sequence_length = + paddle::operators::math::MaximumSequenceLength(lod, level); + const size_t num_sequences = lod[level].size() - 1; + auto padding_dims = + paddle::framework::make_ddim({static_cast(max_sequence_length), + static_cast(num_sequences), + static_cast(sequence_width)}); + padding.mutable_data(padding_dims, *place); + paddle::operators::math::PaddingLoDTensorFunctor()( + *context, seq, padding, false); + + seq_back.set_lod(lod); + seq_back.mutable_data(seq_dims, *place); + paddle::operators::math::UnpaddingLoDTensorFunctor()( + *context, seq_back, padding, false); + + if (paddle::platform::is_cpu_place(*place)) { + cpu_seq_back = seq_back; + } else { + Copy(seq_back, paddle::platform::CPUPlace(), *context, &cpu_seq_back); + cpu_seq_back.set_lod(lod); + } + + EXPECT_EQ(cpu_seq.numel(), cpu_seq_back.numel()); + EXPECT_EQ(cpu_seq.dims(), cpu_seq_back.dims()); + for (int64_t i = 0; i < cpu_seq.numel(); ++i) { + EXPECT_EQ(cpu_seq.data()[i], cpu_seq_back.data()[i]); + } + + delete place; + delete context; +}; + +TEST(Seq2BatchPadding, CPU) { + paddle::framework::LoD lod1; + lod1.push_back(std::vector{0, 10}); + TestSequencePadding(lod1, 16); + + paddle::framework::LoD lod2; + 
lod2.push_back(std::vector{0, 2, 7, 10}); + TestSequencePadding(lod2, 128); +} + +#ifdef PADDLE_WITH_CUDA +TEST(SequencePadding, CUDA) { + paddle::framework::LoD lod1; + lod1.push_back(std::vector{0, 10}); + TestSequencePadding(lod1, 16); + + paddle::framework::LoD lod2; + lod2.push_back(std::vector{0, 2, 7, 10}); + TestSequencePadding(lod2, 128); +} +#endif diff --git a/paddle/fluid/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc new file mode 100644 index 0000000000000000000000000000000000000000..b3b87ec93e19c4dcf99fbacf8f882f9f065430de --- /dev/null +++ b/paddle/fluid/operators/math/sequence_pooling.cc @@ -0,0 +1,103 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/sequence_pooling.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { +namespace math { + +template +class MaxSeqPoolFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::LoDTensor& input, framework::Tensor* output, + framework::Tensor* index) { + auto in_dims = input.dims(); + auto out_dims = output->dims(); + auto idx_dims = index->dims(); + PADDLE_ENFORCE_GT(in_dims.size(), 1); + PADDLE_ENFORCE_GT(out_dims.size(), 1); + for (int64_t i = 1; i < in_dims.size(); ++i) { + PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]); + } + PADDLE_ENFORCE_EQ(idx_dims, out_dims); + + auto starts = input.lod()[0]; + const T* in_data = input.data(); + T* out_data = output->data(); + int* max_index = index->data(); + + int64_t num_seq = out_dims[0]; + int64_t dim = output->numel() / num_seq; + for (int64_t i = 0; i < num_seq; ++i) { + for (int64_t k = 0; k < dim; ++k) { + out_data[i * dim + k] = in_data[starts[i] * dim + k]; + max_index[i * dim + k] = starts[i]; + } + for (size_t j = starts[i] + 1; j < starts[i + 1]; ++j) { + for (int64_t k = 0; k < dim; ++k) { + if (in_data[j * dim + k] > out_data[i * dim + k]) { + out_data[i * dim + k] = in_data[j * dim + k]; + max_index[i * dim + k] = j; + } + } + } + } + } +}; + +template +class MaxSeqPoolGradFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& out_grad, + const framework::Tensor& index, + framework::LoDTensor* in_grad) { + auto og_dims = out_grad.dims(); + auto ig_dims = in_grad->dims(); + auto idx_dims = index.dims(); + PADDLE_ENFORCE_GT(og_dims.size(), 1); + PADDLE_ENFORCE_GT(ig_dims.size(), 1); + for (int64_t i = 1; i < og_dims.size(); ++i) { + PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i]); + } + PADDLE_ENFORCE_EQ(idx_dims, og_dims); + + const T* og_data = out_grad.data(); + const int* max_index = index.data(); + T* ig_data = in_grad->data(); + + SetConstant set_zero; + set_zero(context, in_grad, static_cast(0.0)); + int64_t num_seq = og_dims[0]; + int64_t dim = out_grad.numel() / num_seq; + for (int64_t i = 0; i < num_seq; ++i) { + for (int64_t j = 0; j < dim; ++j) { + int step_id = max_index[i * dim + j]; + ig_data[step_id 
* dim + j] = og_data[i * dim + j]; + } + } + } +}; + +template class MaxSeqPoolFunctor; +template class MaxSeqPoolFunctor; +template class MaxSeqPoolGradFunctor; +template class MaxSeqPoolGradFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/sequence_pooling.cu b/paddle/fluid/operators/math/sequence_pooling.cu new file mode 100644 index 0000000000000000000000000000000000000000..c4267e992a78fd7e80a45ca1dcc1d81ca0f6e8f5 --- /dev/null +++ b/paddle/fluid/operators/math/sequence_pooling.cu @@ -0,0 +1,135 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/sequence_pooling.h" + +namespace paddle { +namespace operators { +namespace math { + +#define FLT_MAX __FLT_MAX__ + +template +__global__ void KeMaxSequencePool(const T* input, const size_t* starts, + T* output, int* index, int64_t num_seq, + int64_t dim) { + int dim_idx = threadIdx.x; + int seq_id = blockIdx.x; + if (seq_id >= num_seq) return; + size_t start = starts[seq_id]; + size_t end = starts[seq_id + 1]; + + for (int64_t i = dim_idx; i < dim; i += blockDim.x) { + T max_val = static_cast(-FLT_MAX); + int max_id = -1; + for (size_t step_id = start; step_id < end; step_id++) { + if (max_val < input[step_id * dim + i]) { + max_val = input[step_id * dim + i]; + max_id = step_id; + } + } + output[seq_id * dim + i] = max_val; + index[seq_id * dim + i] = max_id; + } +} + +template +class MaxSeqPoolFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::LoDTensor& input, framework::Tensor* output, + framework::Tensor* index) { + auto in_dims = input.dims(); + auto out_dims = output->dims(); + auto idx_dims = index->dims(); + PADDLE_ENFORCE_GT(in_dims.size(), static_cast(1)); + PADDLE_ENFORCE_GT(out_dims.size(), 1); + for (int64_t i = 1; i < in_dims.size(); ++i) { + PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]); + } + PADDLE_ENFORCE_EQ(idx_dims, out_dims); + + auto starts = input.lod()[0]; + const T* in_data = input.data(); + T* out_data = output->data(); + int* max_index = index->data(); + + int64_t num_seq = out_dims[0]; + int64_t dim = output->numel() / num_seq; + + dim3 threads(256, 1); + dim3 grid(num_seq, 1); + auto stream = context.stream(); + KeMaxSequencePool<<>>( + in_data, starts.CUDAData(context.GetPlace()), out_data, max_index, + num_seq, dim); + } +}; + +template +__global__ void KeMaxSequencePoolGrad(const T* out_grad, const int* max_index, + T* in_grad, int64_t num_seq, + int64_t dim) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + int col_idx = idx % dim; + if (idx < num_seq * dim) { + int step_id = max_index[idx]; + in_grad[step_id * dim + col_idx] = out_grad[idx]; + } +} + +template +class MaxSeqPoolGradFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& out_grad, + const framework::Tensor& index, + 
framework::LoDTensor* in_grad) { + auto og_dims = out_grad.dims(); + auto idx_dims = index.dims(); + auto ig_dims = in_grad->dims(); + PADDLE_ENFORCE_GT(og_dims.size(), static_cast(1)); + PADDLE_ENFORCE_GT(ig_dims.size(), static_cast(1)); + for (int64_t i = 1; i < og_dims.size(); ++i) { + PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i]); + } + PADDLE_ENFORCE_EQ(idx_dims, og_dims); + + const T* og_data = out_grad.data(); + const int* max_index = index.data(); + T* ig_data = in_grad->data(); + + SetConstant set_zero; + set_zero(context, in_grad, static_cast(0.0)); + int64_t num_seq = og_dims[0]; + int64_t dim = out_grad.numel() / num_seq; + + unsigned int blocks = (num_seq * dim + 128 - 1) / 128; + dim3 threads(128, 1); + dim3 grid(blocks, 1); + auto stream = context.stream(); + KeMaxSequencePoolGrad<<>>( + og_data, max_index, ig_data, num_seq, dim); + } +}; + +template class MaxSeqPoolFunctor; +template class MaxSeqPoolFunctor; +template class MaxSeqPoolGradFunctor; +template class MaxSeqPoolGradFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/sequence_pooling.h b/paddle/fluid/operators/math/sequence_pooling.h new file mode 100644 index 0000000000000000000000000000000000000000..9ba9cad74b54b3d0835ba5f89f6498f9309875cc --- /dev/null +++ b/paddle/fluid/operators/math/sequence_pooling.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace math { + +#define FLT_MAX __FLT_MAX__ + +template +class MaxSeqPoolFunctor { + public: + void operator()(const DeviceContext& context, + const framework::LoDTensor& input, framework::Tensor* output, + framework::Tensor* index); +}; + +template +class MaxSeqPoolGradFunctor { + public: + void operator()(const DeviceContext& context, + const framework::Tensor& out_grad, + const framework::Tensor& index, + framework::LoDTensor* in_grad); +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/sequence_scale.cc b/paddle/fluid/operators/math/sequence_scale.cc new file mode 100644 index 0000000000000000000000000000000000000000..427689b9718db6ed8cbb8525712404c1498faf20 --- /dev/null +++ b/paddle/fluid/operators/math/sequence_scale.cc @@ -0,0 +1,46 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/sequence_scale.h" + +namespace paddle { +namespace operators { +namespace math { + +template +class ScaleLoDTensorFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + framework::LoDTensor& seq, const T* scales) { + const size_t level = 0; + auto lod = seq.lod(); + const size_t num_seq = lod[level].size() - 1; + size_t seq_width = seq.dims()[1]; + framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); + + T* seq_data = seq.mutable_data(context.GetPlace()); + for (size_t i = 0; i < num_seq; ++i) { + for (size_t j = lod[level][i] * seq_width; + j < lod[level][i + 1] * seq_width; ++j) { + seq_data[j] *= scales[i]; + } + } + } +}; + +template class ScaleLoDTensorFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/sequence_scale.cu b/paddle/fluid/operators/math/sequence_scale.cu new file mode 100644 index 0000000000000000000000000000000000000000..7c081ed7f4547c4e1200ea7554f83696e404d021 --- /dev/null +++ b/paddle/fluid/operators/math/sequence_scale.cu @@ -0,0 +1,58 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/sequence_scale.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { +namespace math { + +using platform::PADDLE_CUDA_NUM_THREADS; + +template +__global__ void SequenceScaleKernel(T* seq, size_t* lod, const T* scales, + const size_t seq_width) { + for (int i = threadIdx.x; + i < (lod[blockIdx.x + 1] - lod[blockIdx.x]) * seq_width; + i += BlockSize) { + int idx = lod[blockIdx.x] * seq_width + i; + seq[idx] *= scales[blockIdx.x]; + } +} + +template +class ScaleLoDTensorFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + framework::LoDTensor& seq, const T* scales) { + const size_t level = 0; + auto lod = seq.lod(); + const size_t num_seq = lod[level].size() - 1; + const size_t seq_width = seq.numel() / seq.dims()[0]; + framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); + T* seq_data = seq.mutable_data(context.GetPlace()); + + SequenceScaleKernel<<< + num_seq, PADDLE_CUDA_NUM_THREADS, 0, context.stream()>>>( + seq_data, abs_offset_lod[level].CUDAMutableData(context.GetPlace()), + scales, seq_width); + } +}; + +template class ScaleLoDTensorFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/sequence_scale.h b/paddle/fluid/operators/math/sequence_scale.h new file mode 100644 index 0000000000000000000000000000000000000000..e8e07fd3156cc516c904a1d3d510a7c6eed5b8a0 --- /dev/null +++ b/paddle/fluid/operators/math/sequence_scale.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace math { + +/* + * \brief Scale a sequence. + * + * All sequences will be padded to the same length and stored in a transposed + * shape. + * Example: + * Given: + * seq = (s0, s0, s0, s0; s1, s1; s2, s2, s2; s3) + * scales = (2, 3, 4, 5) + * then: + * result = (2*s0, 2*s0, 2*s0, 2*s0; 3*s1, 3*s1; 4*s2, 4*s2, 4*s2; 5*s3) + + * + * \param context Device context of this functor. + * \param seq LoDTensor which is stored in sequence format, the shape + * is [total_sequence_length, sequence_width] where + * total_sequence_length is the sum of all sequences' + * length. + * \param scales Array. The i-th sequence will be scaled by scales[i]. + * \param num_seq Number of sequence + * + */ +template +class ScaleLoDTensorFunctor { + public: + void operator()(const DeviceContext& context, framework::LoDTensor& seq, + const T* scales); +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/softmax.cc b/paddle/fluid/operators/math/softmax.cc new file mode 100644 index 0000000000000000000000000000000000000000..eab31ec567d15a52661bf5ab2373819d8e1e7ddf --- /dev/null +++ b/paddle/fluid/operators/math/softmax.cc @@ -0,0 +1,29 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/softmax.h" +#include "paddle/fluid/operators/math/softmax_impl.h" + +namespace paddle { +namespace operators { +namespace math { + +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxGradFunctor; +template class SoftmaxGradFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu new file mode 100644 index 0000000000000000000000000000000000000000..733d7eeee6d08241783b8f854b25863f9b756c80 --- /dev/null +++ b/paddle/fluid/operators/math/softmax.cu @@ -0,0 +1,31 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU + +#include "paddle/fluid/operators/math/softmax.h" +#include "paddle/fluid/operators/math/softmax_impl.h" + +namespace paddle { +namespace operators { +namespace math { + +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxGradFunctor; +template class SoftmaxGradFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/softmax.h b/paddle/fluid/operators/math/softmax.h new file mode 100644 index 0000000000000000000000000000000000000000..b7d67d5f12d83f015297e75c730de27566e5489b --- /dev/null +++ b/paddle/fluid/operators/math/softmax.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace operators { +namespace math { + +template +class SoftmaxFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor* X, + framework::Tensor* Y); +}; + +template +class SoftmaxGradFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor* y, + const framework::Tensor* y_grad, framework::Tensor* x_grad); +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..f7c61cb647e899e25f3d9806c993395e898794ab --- /dev/null +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -0,0 +1,96 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace operators { +namespace math { + +template +using EigenMatrix = framework::EigenMatrix; + +template +struct ValueClip { + HOSTDEVICE T operator()(const T& x) const { + const T kThreshold = -64.; + return x < kThreshold ? 
kThreshold : x; + } +}; + +template +void SoftmaxFunctor::operator()(const DeviceContext& context, + const framework::Tensor* X, + framework::Tensor* Y) { + auto logits = EigenMatrix::From(*X); + auto softmax = EigenMatrix::From(*Y); + + const int kBatchDim = 0; + const int kClassDim = 1; + + const int batch_size = logits.dimension(kBatchDim); + const int num_classes = logits.dimension(kClassDim); + + Eigen::DSizes along_class(kClassDim); + Eigen::DSizes batch_by_one(batch_size, 1); + Eigen::DSizes one_by_class(1, num_classes); + + auto shifted_logits = (logits - + logits.maximum(along_class) + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class)) + .unaryExpr(ValueClip()); + + softmax.device(*context.eigen_device()) = shifted_logits.exp(); + softmax.device(*context.eigen_device()) = (softmax * + softmax.sum(along_class) + .inverse() + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class)); +} + +template +void SoftmaxGradFunctor::operator()( + const DeviceContext& context, const framework::Tensor* y, + const framework::Tensor* y_grad, framework::Tensor* x_grad) { + auto softmax = EigenMatrix::From(*y); + auto softmax_grad = EigenMatrix::From(*y_grad); + auto logits_grad = EigenMatrix::From(*x_grad); + + const int kBatchDim = 0; + const int kClassDim = 1; + + const int batch_size = softmax.dimension(kBatchDim); + const int num_classes = softmax.dimension(kClassDim); + + Eigen::DSizes along_class(kClassDim); + Eigen::DSizes batch_by_one(batch_size, 1); + Eigen::DSizes one_by_class(1, num_classes); + + auto dot = (softmax * softmax_grad) + .sum(along_class) + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class); + logits_grad.device(*context.eigen_device()) = (softmax_grad - dot) * softmax; +} + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/unpooling.cc b/paddle/fluid/operators/math/unpooling.cc new file mode 100644 index 0000000000000000000000000000000000000000..e02bc02e0022b82085e29ac6c83677c0accfce49 --- /dev/null +++ b/paddle/fluid/operators/math/unpooling.cc @@ -0,0 +1,91 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/unpooling.h" +namespace paddle { +namespace operators { +namespace math { +template +class Unpool2dMaxFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& indices, framework::Tensor* output) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output->dims()[1]; + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + int input_feasize = input_height * input_width; + int output_feasize = output_height * output_width; + const T* input_data = input.data(); + const int* indices_data = indices.data(); + T* output_data = output->mutable_data(context.GetPlace()); + for (int b = 0; b < batch_size; ++b) { + for (int c = 0; c < output_channels; ++c) { + for (int i = 0; i < input_feasize; ++i) { + int index = indices_data[i]; + PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!"); + output_data[index] = input_data[i]; + } + input_data += input_feasize; + indices_data += input_feasize; + output_data += output_feasize; + } + } + } +}; +template +class Unpool2dMaxGradFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& indices, + const framework::Tensor& output, + const framework::Tensor& output_grad, + framework::Tensor* input_grad) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output.dims()[1]; + const int output_height = output.dims()[2]; + const int output_width = output.dims()[3]; + int input_feasize = input_height * input_width; + int output_feasize = output_height * output_width; + const int* indices_data = indices.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + for (int b = 0; b < batch_size; ++b) { + for (int c = 0; c < output_channels; ++c) { + for (int i = 0; i < input_feasize; ++i) { + int index = indices_data[i]; + PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!"); + input_grad_data[i] = output_grad_data[index]; + } + input_grad_data += input_feasize; + indices_data += input_feasize; + output_grad_data += output_feasize; + } + } + } +}; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxFunctor; +template class Unpool2dMaxFunctor; +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/unpooling.cu b/paddle/fluid/operators/math/unpooling.cu new file mode 100644 index 0000000000000000000000000000000000000000..2e74270fdf16b470ab6438a3283525c725b2d01b --- /dev/null +++ b/paddle/fluid/operators/math/unpooling.cu @@ -0,0 +1,128 @@ +/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/unpooling.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { +namespace math { +template +__global__ void KernelUnpool2dMax(const int nthreads, const T* input_data, + const int* indices_data, + const int input_height, const int input_width, + const int channels, T* output_data, + const int output_height, + const int output_width) { + int in_n_stride = input_height * input_width * channels; + int in_c_stride = input_height * input_width; + int out_n_stride = output_height * output_width * channels; + int out_c_stride = output_height * output_width; + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (int i = index; i < nthreads; i += offset) { + int bidx = i / in_n_stride; + int boffset = i % in_n_stride; + int cidx = boffset / in_c_stride; + int out_offset = bidx * out_n_stride + cidx * out_c_stride; + int out_index = indices_data[i]; + PADDLE_ASSERT(out_index < out_c_stride); + output_data[out_offset + out_index] = input_data[i]; + } +} +template +__global__ void KernelUnpool2dMaxGrad( + const int nthreads, const T* input_data, const int* indices_data, + const int input_height, const int input_width, const int channels, + const T* output_data, const T* output_grad, const int output_height, + const int output_width, T* input_grad) { + int in_n_stride = input_height * input_width * channels; + int in_c_stride = input_height * input_width; + int out_n_stride = output_height * output_width * channels; + int out_c_stride = output_height * output_width; + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (int i = index; i < nthreads; i += offset) { + int bidx = i / in_n_stride; + int boffset = i % in_n_stride; + int cidx = boffset / in_c_stride; + int out_offset = bidx * out_n_stride + cidx * out_c_stride; + int out_index = indices_data[i]; + PADDLE_ASSERT(out_index < out_c_stride); + input_grad[i] = output_grad[out_offset + out_index]; + } +} +/* + * All tensors are in NCHW format. + */ +template +class Unpool2dMaxFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& indices, framework::Tensor* output) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output->dims()[1]; + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + const T* input_data = input.data(); + const int* indices_data = indices.data(); + T* output_data = output->mutable_data(context.GetPlace()); + int threads = 1024; + int grid = (input.numel() + threads - 1) / threads; + KernelUnpool2dMax<<>>( + input.numel(), input_data, indices_data, input_height, input_width, + output_channels, output_data, output_height, output_width); + } +}; +/* + * All tensors are in NCHW format. 
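 + * NCHW here means [batch_size, channels, height, width]. In the kernels
 + * above, `indices` holds, for each input element, the flat offset of its max
 + * location within the matching output channel plane (so every value lies in
 + * [0, output_height * output_width)), and the backward kernel simply gathers
 + * the gradient from that offset: input_grad[i] = output_grad[out_offset + out_index].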
+ */ +template +class Unpool2dMaxGradFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& indices, + const framework::Tensor& output, + const framework::Tensor& output_grad, + framework::Tensor* input_grad) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output.dims()[1]; + const int output_height = output.dims()[2]; + const int output_width = output.dims()[3]; + const T* input_data = input.data(); + const int* indices_data = indices.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + int threads = 1024; + int grid = (input.numel() + threads - 1) / threads; + KernelUnpool2dMaxGrad<<>>( + input.numel(), input_data, indices_data, input_height, input_width, + output_channels, output_data, output_grad_data, output_height, + output_width, input_grad_data); + } +}; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxFunctor; +template class Unpool2dMaxFunctor; +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/unpooling.h b/paddle/fluid/operators/math/unpooling.h new file mode 100644 index 0000000000000000000000000000000000000000..f245ba7ba873e7f217c577e0c895f9a8d48e9cdf --- /dev/null +++ b/paddle/fluid/operators/math/unpooling.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace operators { +namespace math { +template +class Unpool2dMaxFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& indices, framework::Tensor* output); +}; +template +class Unpool2dMaxGradFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& indices, + const framework::Tensor& output, + const framework::Tensor& output_grad, + framework::Tensor* input_grad); +}; +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/vol2col.cc b/paddle/fluid/operators/math/vol2col.cc new file mode 100644 index 0000000000000000000000000000000000000000..ded0bbc74477656310cb4d464c5709173f20f505 --- /dev/null +++ b/paddle/fluid/operators/math/vol2col.cc @@ -0,0 +1,200 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/vol2col.h" + +namespace paddle { +namespace operators { +namespace math { + +/* + * vol = [input_channels, input_depth, input_height, input_width] + * col = + * [input_channels, filter_depth, filter_height, filter_width, + * output_depth, output_height, output_width] + */ +template +class Vol2ColFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& vol, + const std::vector& dilations, + const std::vector& strides, + const std::vector& paddings, + framework::Tensor* col) const { + PADDLE_ENFORCE(vol.dims().size() == 4); + PADDLE_ENFORCE(col->dims().size() == 7); + + int input_channels = vol.dims()[0]; + int input_depth = vol.dims()[1]; + int input_height = vol.dims()[2]; + int input_width = vol.dims()[3]; + int filter_depth = col->dims()[1]; + int filter_height = col->dims()[2]; + int filter_width = col->dims()[3]; + int output_depth = col->dims()[4]; + int output_height = col->dims()[5]; + int output_width = col->dims()[6]; + int channels_col = + input_channels * filter_depth * filter_height * filter_width; + + PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] - + ((dilations[0] * (filter_depth - 1) + 1))) / + strides[0] + + 1, + output_depth, + "input_depth and output_depth are " + "mismatching."); + PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] - + ((dilations[1] * (filter_height - 1) + 1))) / + strides[1] + + 1, + output_height, + "input_height and output_height are " + "mismatching."); + PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] - + ((dilations[2] * (filter_width - 1) + 1))) / + strides[2] + + 1, + output_width, + "input_width and output_width are " + "mismatching."); + + const T* vol_data = vol.data(); + T* col_data = col->data(); + + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % filter_width; + int h_offset = (c / filter_width) % filter_height; + int d_offset = (c / filter_width / filter_height) % filter_depth; + int c_in = c / filter_width / filter_height / filter_depth; + for (int d = 0; d < output_depth; ++d) { + int d_pad = d * strides[0] - paddings[0] + d_offset * dilations[0]; + for (int h = 0; h < output_height; ++h) { + int h_pad = h * strides[1] - paddings[1] + h_offset * dilations[1]; + for (int w = 0; w < output_width; ++w) { + int w_pad = w * strides[2] - paddings[2] + w_offset * dilations[2]; + + int col_idx = + ((c * output_depth + d) * output_height + h) * output_width + w; + int vol_idx = + ((c_in * input_depth + d_pad) * input_height + h_pad) * + input_width + + w_pad; + col_data[col_idx] = + (h_pad < 0 || h_pad >= input_height || w_pad < 0 || + w_pad >= input_width || d_pad < 0 || d_pad >= input_depth) + ? 
static_cast(0) + : vol_data[vol_idx]; + } + } + } + } + } +}; + +/* + * vol = [input_channels,input_depth, input_height, input_width] + * col = + * [input_channels, filter_depth, filter_height, filter_width, + * output_depth, output_height, output_width] + */ +template +class Col2VolFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& col, + const std::vector& dilations, + const std::vector& strides, + const std::vector& paddings, + framework::Tensor* vol) const { + PADDLE_ENFORCE(vol->dims().size() == 4); + PADDLE_ENFORCE(col.dims().size() == 7); + + int input_channels = vol->dims()[0]; + int input_depth = vol->dims()[1]; + int input_height = vol->dims()[2]; + int input_width = vol->dims()[3]; + int filter_depth = col.dims()[1]; + int filter_height = col.dims()[2]; + int filter_width = col.dims()[3]; + int output_depth = col.dims()[4]; + int output_height = col.dims()[5]; + int output_width = col.dims()[6]; + int channels_col = + input_channels * filter_depth * filter_height * filter_width; + + PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] - + ((dilations[0] * (filter_depth - 1) + 1))) / + strides[0] + + 1, + output_depth, + "input_depth and output_depth are " + "mismatching."); + PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] - + ((dilations[1] * (filter_height - 1) + 1))) / + strides[1] + + 1, + output_height, + "input_height and output_height are " + "mismatching."); + PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] - + ((dilations[2] * (filter_width - 1) + 1))) / + strides[2] + + 1, + output_width, + "input_width and output_width are " + "mismatching."); + T* vol_data = vol->data(); + const T* col_data = col.data(); + + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % filter_width; + int h_offset = (c / filter_width) % filter_height; + int d_offset = (c / filter_width / filter_height) % filter_depth; + int cIm = c / filter_width / filter_height / filter_depth; + for (int d = 0; d < output_depth; ++d) { + int d_pad = d * strides[0] - paddings[0] + d_offset * dilations[0]; + for (int h = 0; h < output_height; ++h) { + int h_pad = h * strides[1] - paddings[1] + h_offset * dilations[1]; + for (int w = 0; w < output_width; ++w) { + int w_pad = w * strides[2] - paddings[2] + w_offset * dilations[2]; + + if (h_pad >= 0 && h_pad < input_height && w_pad >= 0 && + w_pad < input_width && d_pad >= 0 && d_pad < input_depth) { + int vol_idx = + ((cIm * input_depth + d_pad) * input_height + h_pad) * + input_width + + w_pad; + + int col_idx = + ((c * output_depth + d) * output_height + h) * output_width + + w; + vol_data[vol_idx] += col_data[col_idx]; + } + } + } + } + } + } +}; + +template class Vol2ColFunctor; +template class Vol2ColFunctor; +template class Col2VolFunctor; +template class Col2VolFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/vol2col.cu b/paddle/fluid/operators/math/vol2col.cu new file mode 100644 index 0000000000000000000000000000000000000000..35ef24c7f5ffe793a4aefe69807da5ffcf5ced4a --- /dev/null +++ b/paddle/fluid/operators/math/vol2col.cu @@ -0,0 +1,262 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/vol2col.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { +namespace math { + +template +__global__ void vol2col(int num_kernels, const T* data_vol, int depth, + int height, int width, int dilation_d, int dilation_h, + int dilation_w, int filter_depth, int filter_height, + int filter_width, int stride_depth, int stride_height, + int stride_width, int padding_depth, int padding_height, + int padding_width, int output_detph, int output_height, + int output_width, T* data_col) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels; + index += blockDim.x * gridDim.x) { + int w_out = index % output_width; + int h_out = (index / output_width) % output_height; + int d_out = (index / output_width / output_height) % output_detph; + int channel_in = index / output_width / output_height / output_detph; + int channel_out = channel_in * filter_depth * filter_height * filter_width; + int w_in = w_out * stride_width - padding_width; + int h_in = h_out * stride_height - padding_height; + int d_in = d_out * stride_depth - padding_depth; + + data_col += ((channel_out * output_detph + d_out) * output_height + h_out) * + output_width + + w_out; + data_vol += ((channel_in * depth + d_in) * height + h_in) * width + w_in; + for (int k = 0; k < filter_depth; ++k) { + for (int i = 0; i < filter_height; ++i) { + for (int j = 0; j < filter_width; ++j) { + int d = d_in + k * dilation_d; + int h = h_in + i * dilation_h; + int w = w_in + j * dilation_w; + int col_idx = (k * dilation_d * height + i * dilation_h) * width + + j * dilation_w; + *data_col = (d >= 0 && d < depth && h >= 0 && h < height && w >= 0 && + w < width) + ? 
data_vol[col_idx] + : 0; + data_col += output_detph * output_height * output_width; + } + } + } + } +} + +/* + * im = [input_channels,intpu_depth, input_height, input_width] + * col = + * [input_channels, filter_depth, filter_height, filter_width, + * output_depth, output_height, output_width] + */ +template +class Vol2ColFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& vol, + const std::vector& dilations, + const std::vector& strides, + const std::vector& paddings, + framework::Tensor* col) const { + PADDLE_ENFORCE(vol.dims().size() == 4); + PADDLE_ENFORCE(col->dims().size() == 7); + + int input_channels = vol.dims()[0]; + int input_depth = vol.dims()[1]; + int input_height = vol.dims()[2]; + int input_width = vol.dims()[3]; + int filter_depth = col->dims()[1]; + int filter_height = col->dims()[2]; + int filter_width = col->dims()[3]; + int output_depth = col->dims()[4]; + int output_height = col->dims()[5]; + int output_width = col->dims()[6]; + + PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] - + ((dilations[0] * (filter_depth - 1) + 1))) / + strides[0] + + 1, + output_depth, + "input_depth and output_depth are " + "Mismatching."); + PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] - + ((dilations[1] * (filter_height - 1) + 1))) / + strides[1] + + 1, + output_height, + "input_height and output_height are " + "Mismatching."); + PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] - + ((dilations[2] * (filter_width - 1) + 1))) / + strides[2] + + 1, + output_width, + "input_width and output_width are " + "Mismatching."); + + int num_outputs = + input_channels * output_depth * output_height * output_width; + + const int threads = 1024; + const int blocks = (num_outputs + 1024 - 1) / 1024; + vol2col<<>>( + num_outputs, vol.data(), input_depth, input_height, input_width, + dilations[0], dilations[1], dilations[2], filter_depth, filter_height, + filter_width, strides[0], strides[1], strides[2], paddings[0], + paddings[1], paddings[2], output_depth, output_height, output_width, + col->data()); + } +}; + +template +__global__ void col2vol(int num_kernels, const T* data_col, int depth, + int height, int width, int dilation_d, int dilation_h, + int dilation_w, int filter_depth, int filter_height, + int filter_width, int stride_depth, int stride_height, + int stride_width, int padding_depth, int padding_height, + int padding_width, int output_detph, int output_height, + int output_width, T* data_vol) { + const int d_filter_depth = dilation_d * (filter_depth - 1) + 1; + const int d_filter_height = dilation_h * (filter_height - 1) + 1; + const int d_filter_width = dilation_w * (filter_width - 1) + 1; + + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels; + index += blockDim.x * gridDim.x) { + T src_val = 0; + int w = index % width + padding_width; + int h = (index / width) % height + padding_height; + int d = (index / width / height) % depth + padding_depth; + int c = index / width / height / depth; + + // compute the start and end of the output + int w_col_start = + (w < d_filter_width) ? 0 : (w - d_filter_width) / stride_width + 1; + int w_col_end = min(w / stride_width + 1, output_width); + int h_col_start = + (h < d_filter_height) ? 0 : (h - d_filter_height) / stride_height + 1; + int h_col_end = min(h / stride_height + 1, output_height); + int d_col_start = + (d < d_filter_depth) ? 
0 : (d - d_filter_depth) / stride_depth + 1; + int d_col_end = min(d / stride_depth + 1, output_detph); + + for (int d_col = d_col_start; d_col < d_col_end; ++d_col) { + for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { + for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { + int d_off = (d - d_col * stride_depth); + int h_off = (h - h_col * stride_height); + int w_off = (w - w_col * stride_width); + if (d_off % dilation_d == 0 && h_off % dilation_h == 0 && + w_off % dilation_w == 0) { + d_off /= dilation_d; + h_off /= dilation_h; + w_off /= dilation_w; + + int data_col_index = + (((((c * filter_depth + d_off) * filter_height + h_off) * + filter_width + + w_off))); + data_col_index = + ((data_col_index * output_detph + d_col) * output_height + + h_col) * + output_width + + w_col; + src_val += data_col[data_col_index]; + } + } + } + } + data_vol[index] = src_val; + } +} + +/* + * im = [input_channels, input_depth, input_height, input_width] + * col = + * [input_channels, filter_depth, filter_height, filter_width, + * output_depth, output_height, output_width] + */ +template +class Col2VolFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& col, + const std::vector& dilations, + const std::vector& strides, + const std::vector& paddings, + framework::Tensor* vol) const { + PADDLE_ENFORCE(vol->dims().size() == 4); + PADDLE_ENFORCE(col.dims().size() == 7); + + int input_channels = vol->dims()[0]; + int input_depth = vol->dims()[1]; + int input_height = vol->dims()[2]; + int input_width = vol->dims()[3]; + int filter_depth = col.dims()[1]; + int filter_height = col.dims()[2]; + int filter_width = col.dims()[3]; + int output_depth = col.dims()[4]; + int output_height = col.dims()[5]; + int output_width = col.dims()[6]; + + PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] - + ((dilations[0] * (filter_depth - 1) + 1))) / + strides[0] + + 1, + output_depth, + "input_depth and output_depth are " + "Mismatching."); + PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] - + ((dilations[1] * (filter_height - 1) + 1))) / + strides[1] + + 1, + output_height, + "input_height and output_height are " + "Mismatching."); + PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] - + ((dilations[2] * (filter_width - 1) + 1))) / + strides[2] + + 1, + output_width, + "input_width and output_width are " + "Mismatching."); + + int num_kernels = input_channels * input_depth * input_height * input_width; + + const int threads = 1024; + const int blocks = (num_kernels + 1024 - 1) / 1024; + + col2vol<<>>( + num_kernels, col.data(), input_depth, input_height, input_width, + dilations[0], dilations[1], dilations[2], filter_depth, filter_height, + filter_width, strides[0], strides[1], strides[2], paddings[0], + paddings[1], paddings[2], output_depth, output_height, output_width, + vol->data()); + } +}; + +template class Vol2ColFunctor; +template class Vol2ColFunctor; +template class Col2VolFunctor; +template class Col2VolFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/vol2col.h b/paddle/fluid/operators/math/vol2col.h new file mode 100644 index 0000000000000000000000000000000000000000..3ce38b2d11f7c64f1004a73ecfc7d85a5a6346ba --- /dev/null +++ b/paddle/fluid/operators/math/vol2col.h @@ -0,0 +1,88 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace math { +/* + * \brief Converts the feature data of four dimensions(CDHW) into a colData of + * seven dimensions in the Vol2ColFunctor calculation, + * And in the Col2VolFunctor calculation, it is reversed. + * + * \param volData Vol data. + * \param volShape The shape of volData, + * [input_channels, input_depth, input_height, input_width]. + * \param colData Column data. + * \param colShape The shape of colData. + * + * \param dilations dilation data. + * \param 3-dimension [dilation_depth, dilation_height, dilation_width]. + * + * \param strides stride data. + * \param 3-dimension [stride_depth, stride_height, stride_width]. + * + * \param paddings padding data. + * \param 3-dimension [d_pad, h_pad, w_pad]. + * + * The shape of colData is: + * [input_channels, filter_depth, filter_height, filter_width, output_depth, + * output_height, output_width] + * So, it is easy to reshape into a convolution matrix for convolution + * calculation based on matrix multiplication. + * The shape of convolution matrix is [height, width], where the height is equal + * input_channels * filter_depth * filter_height * filter_width, and the width + * is equal output_depth * output_height * output_width. + * + * Reshape: + * shape of colData shape of convolution matrix + * [input_channels, + * filter_depth, + * filter_height, + * filter_width, ======> [height, width] + * output_depth, + * output_height, + * output_width] + * + * \note The caller needs to ensure that volShape.inputChannels is equal to + * colShape.inputChannels. + */ +template +class Vol2ColFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& vol, + const std::vector& dilations, + const std::vector& strides, + const std::vector& paddings, + framework::Tensor* col) const; +}; + +template +class Col2VolFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& col, + const std::vector& dilations, + const std::vector& strides, + const std::vector& paddings, + framework::Tensor* vol) const; +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/vol2col_test.cc b/paddle/fluid/operators/math/vol2col_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..af0a900f80e9bec2c5cfd0ec7dc66beabba049d7 --- /dev/null +++ b/paddle/fluid/operators/math/vol2col_test.cc @@ -0,0 +1,127 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/vol2col.h" +#include +#include + +template +void testVol2col() { + paddle::framework::Tensor input; + paddle::framework::Tensor input_tmp; + paddle::framework::Tensor output; + paddle::framework::Tensor output_tmp; + + auto* place = new Place(); + DeviceContext* context = new DeviceContext(*place); + + /** + * input = [[0, 1, 2, + * 3, 4, 5] + * [6, 7, 8, + * 9, 10, 11]] + * + * output = [0, 1 + * 1, 2 + * 3, 4 + * 4, 5 + * 6, 7 + * 7, 8 + * 9, 10 + * 10, 11] + * + * col2vol = [[0, 2, 2, + * 3, 8, 5] + * [6, 14, 8, + * 9, 20, 11]] + * + */ + int input_depth = 2; + int input_height = 2; + int input_width = 3; + int filter_size = 2; + std::vector strides({1, 1, 1}); + std::vector paddings({0, 0, 0}); + std::vector dilations({1, 1, 1}); + int output_depth = + (input_depth - filter_size + 2 * paddings[0]) / strides[0] + 1; + int output_height = + (input_height - filter_size + 2 * paddings[1]) / strides[1] + 1; + int output_width = + (input_width - filter_size + 2 * paddings[2]) / strides[2] + 1; + + // Vol2Col test + float* input_ptr = + input_tmp.mutable_data({1, input_depth, input_height, input_width}, + paddle::platform::CPUPlace()); + float arr[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + memcpy(input_ptr, arr, 12 * sizeof(float)); + + if (paddle::platform::is_cpu_place(*place)) { + input = input_tmp; + } else { + Copy(input_tmp, *place, *context, &input); + } + output.mutable_data({1, filter_size, filter_size, filter_size, + output_depth, output_height, output_width}, + *place); + + paddle::operators::math::Vol2ColFunctor vol2col; + vol2col(*context, input, dilations, strides, paddings, &output); + + float vol_2_col[] = {0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11}; + float* out_cfo_ptr; + if (paddle::platform::is_cpu_place(*place)) { + out_cfo_ptr = output.data(); + } else { + Copy(output, paddle::platform::CPUPlace(), *context, &output_tmp); + out_cfo_ptr = output_tmp.data(); + } + + for (int i = 0; i < 16; ++i) { + EXPECT_EQ(out_cfo_ptr[i], vol_2_col[i]); + } + + // Col2Vol test + float col_2_vol[] = {0, 2, 2, 3, 8, 5, 6, 14, 8, 9, 20, 11}; + memset(input_ptr, 0, 12 * sizeof(float)); + if (paddle::platform::is_cpu_place(*place)) { + input = input_tmp; + } else { + Copy(input_tmp, *place, *context, &input); + } + + paddle::operators::math::Col2VolFunctor col2vol; + col2vol(*context, output, dilations, strides, paddings, &input); + + float* in_ptr; + if (paddle::platform::is_cpu_place(*place)) { + in_ptr = input.data(); + } else { + Copy(input, paddle::platform::CPUPlace(), *context, &input_tmp); + in_ptr = input_tmp.data(); + } + + for (int i = 0; i < 12; ++i) { + EXPECT_EQ(in_ptr[i], col_2_vol[i]); + } +} + +TEST(math, vol2col) { + testVol2col(); +#ifdef PADDLE_WITH_CUDA + testVol2col(); +#endif // PADDLE_WITH_CUDA +} diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..267b0057bf4894705b7e6eddb8e2e2eaa5c18c8e --- /dev/null +++ b/paddle/fluid/operators/matmul_op.cc @@ -0,0 +1,244 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. 
All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/matmul_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class MatMulOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* context) const override { + PADDLE_ENFORCE(context->HasInput("X"), + "Input(X) of MatMulOp should not be null."); + PADDLE_ENFORCE(context->HasInput("Y"), + "Input(Y) of MatMulOp should not be null."); + PADDLE_ENFORCE(context->HasOutput("Out"), + "Output(Out) of MatMulOp should not be null."); + + auto dim_x = context->GetInputDim("X"); + auto dim_y = context->GetInputDim("Y"); + bool transpose_x = context->Attrs().Get("transpose_X"); + bool transpose_y = context->Attrs().Get("transpose_Y"); + + PADDLE_ENFORCE_GE(dim_x.size(), 1, + "Input tensor X must be at least 1-dimensional."); + PADDLE_ENFORCE_GE(dim_y.size(), 1, + "Input tensor Y must be at least 1-dimensional."); + + std::vector out_dim; + int64_t batch_count = 1; + if (dim_x.size() > 3) { + PADDLE_ENFORCE_EQ( + dim_y.size(), dim_x.size(), + "The dimensions of X and Y must be the same, and both of " + "them should be %d-dimensional.", + dim_x.size()); + + // The first rank-2 dimensions are accumulated on the batch_count, and the + // last two dimensions are used for matrix multiplication. + for (int j = 0; j < dim_x.size() - 2; ++j) { + PADDLE_ENFORCE_EQ(dim_y[j], dim_x[j], + "The %d-th dimension of X and Y must be the same.", + j); + out_dim.push_back(dim_x[j]); + batch_count *= dim_x[j]; + } + } + + int M = 0, N = 0, KX = 0, KY = 0, batchCountX = 0, batchCountY = 0; + bool remove_initial_dim = false, remove_final_dim = false; + + switch (dim_x.size()) { + case 1: + if (transpose_x) { + M = dim_x[0]; + KX = 1; + } else { + M = 1; + KX = dim_x[0]; + remove_initial_dim = true; + } + break; + case 2: + M = transpose_x ? dim_x[1] : dim_x[0]; + KX = transpose_x ? dim_x[0] : dim_x[1]; + break; + case 3: + batchCountX = dim_x[0]; + M = transpose_x ? dim_x[2] : dim_x[1]; + KX = transpose_x ? dim_x[1] : dim_x[2]; + break; + default: + batchCountX = batch_count; + size_t mat_s = dim_x.size() - 2; + M = transpose_x ? dim_x[mat_s + 1] : dim_x[mat_s]; + KX = transpose_x ? dim_x[mat_s] : dim_x[mat_s + 1]; + break; + } + + switch (dim_y.size()) { + case 1: + if (transpose_y) { + N = dim_y[0]; + KY = 1; + } else { + N = 1; + KY = dim_y[0]; + remove_final_dim = true; + } + break; + case 2: + KY = transpose_y ? dim_y[1] : dim_y[0]; + N = transpose_y ? dim_y[0] : dim_y[1]; + break; + case 3: + batchCountY = dim_y[0]; + KY = transpose_y ? dim_y[2] : dim_y[1]; + N = transpose_y ? dim_y[1] : dim_y[2]; + break; + default: + batchCountY = batch_count; + size_t mat_s = dim_y.size() - 2; + KY = transpose_y ? dim_y[mat_s + 1] : dim_y[mat_s]; + N = transpose_y ? 
dim_y[mat_s] : dim_y[mat_s + 1]; + } + + PADDLE_ENFORCE_EQ( + KX, KY, + "First matrix's width must be equal with second matrix's height."); + if (batchCountX && batchCountY) { + PADDLE_ENFORCE_EQ( + batchCountX, batchCountY, + "When Input(X) and Input(Y) are both three dimensional, they " + "must have the same batch dimension."); + } + int batchCount = std::max(batchCountX, batchCountY); + + std::vector dim_out; + if (batchCount) { + if (dim_x.size() > 3) { + dim_out.insert(dim_out.begin(), out_dim.begin(), out_dim.end()); + } else { + dim_out.push_back(batchCount); + } + } + if (!remove_initial_dim) { + dim_out.push_back(M); + } + if (!remove_final_dim) { + dim_out.push_back(N); + } + if (dim_out.size() == 0) { + // We don't support 0-dimensional Tensors (scalars), so instead + // treat the output as a Tensor of shape (1, ) in this case. + dim_out.push_back(1); + } + context->SetOutputDim("Out", framework::make_ddim(dim_out)); + context->ShareLoD("X", /*->*/ "Out"); + } +}; + +class MatMulOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MatMulOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The first input of MatMul op"); + AddInput("Y", "The second input of MatMul op"); + AddOutput("Out", "The output of MatMul op"); + AddAttr("transpose_X", + R"DOC(If true, use the transpose of `X`. + )DOC") + .SetDefault(false); + AddAttr("transpose_Y", + R"DOC(If true, use the transpose of `Y`. + )DOC") + .SetDefault(false); + AddComment(R"DOC( +MatMul Operator. + + +This operator is used to perform (batched) matrix multiplication +over the last two dimensions of the input tensors `X` and `Y`. + +If a transpose flag is specified, the last two dimensions of the +tensor are transposed. If the tensor is rank-1 of shape [D], then +for `X` it is treated as [1, D] in nontransposed form and as [D, 1] +in transposed form, whereas for `Y` it is the opposite: It is treated +as [D, 1] in nontransposed form and as [1, D] in transposed form. + +Examples without transpose: +- X: [K], Y: [K] => Out: [1] +- X: [K], Y: [K, N] => Out: [N] +- X: [B, M, K], Y: [K] => Out: [B, M] +- X: [M, K], Y: [B, K, N] => Out: [B, M, N] +- X: [B, M, K], Y: [B, K, N] => Out: [B, M, N] +- X: [B, ..., M, K], Y: [B, ..., K, N] => Out: [B, ..., M, N] + +The behavior is designed to be similar to the `numpy.matmul` function. +The differences are: +- When the rank of the input data is less than or equal to 3, it + is similar to the `numpy.matmul` function. +- When the rank of the input is greater than 3, the rank of X and + Y must be equal, and the first `rank - 2` dimensions must be equal. +- We add `transpose_X` and `transpose_Y` flags. + +Both the input `X` and `Y` can carry the LoD (Level of Details) information, +or not. But the output only shares the LoD information with input `X`. 
+ +)DOC"); + } +}; + +class MatMulOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* context) const override { + PADDLE_ENFORCE(context->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(context->HasInput("Y"), "Input(Y) should not be null"); + PADDLE_ENFORCE(context->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + auto x_dims = context->GetInputDim("X"); + auto y_dims = context->GetInputDim("Y"); + + auto x_grad_name = framework::GradVarName("X"); + auto y_grad_name = framework::GradVarName("Y"); + + if (context->HasOutput(x_grad_name)) { + context->SetOutputDim(x_grad_name, x_dims); + } + if (context->HasOutput(y_grad_name)) { + context->SetOutputDim(y_grad_name, y_dims); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(matmul, ops::MatMulOp, ops::MatMulOpMaker, matmul_grad, + ops::MatMulOpGrad); +REGISTER_OP_CPU_KERNEL( + matmul, ops::MatMulKernel); +REGISTER_OP_CPU_KERNEL( + matmul_grad, + ops::MatMulGradKernel); diff --git a/paddle/fluid/operators/matmul_op.cu.cc b/paddle/fluid/operators/matmul_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..988787f0fe40ccdb266327dbff400a72f5dc448b --- /dev/null +++ b/paddle/fluid/operators/matmul_op.cu.cc @@ -0,0 +1,22 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/matmul_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + matmul, ops::MatMulKernel); +REGISTER_OP_CUDA_KERNEL( + matmul_grad, + ops::MatMulGradKernel); diff --git a/paddle/fluid/operators/matmul_op.h b/paddle/fluid/operators/matmul_op.h new file mode 100644 index 0000000000000000000000000000000000000000..f4cae3c91cb03980b915a70242e368852322b365 --- /dev/null +++ b/paddle/fluid/operators/matmul_op.h @@ -0,0 +1,242 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/matmul.h" + +namespace paddle { +namespace operators { +namespace matmul_detail { + +using Tensor = framework::Tensor; +using DDim = framework::DDim; +using framework::make_ddim; +using framework::vectorize; + +template +class MatMulKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor& x = *context.Input("X"); + const Tensor& y = *context.Input("Y"); + Tensor* out = context.Output("Out"); + out->mutable_data(context.GetPlace()); + bool transpose_x = context.Attr("transpose_X"); + bool transpose_y = context.Attr("transpose_Y"); + + math::MatMulFunctor()( + context.template device_context(), x, transpose_x, y, + transpose_y, T(1), out, T(0)); + } +}; + +template +inline Tensor Reshape(const Tensor& input, const DDim& dims) { + Tensor output; + output.ShareDataWith(input); + output.Resize(dims); + return output; +} + +// Reshape a rank-3 tensor from P x M x N to (P * M) x N. +// Identity op if the tensor is not of rank 3. +template +Tensor CombineBatchAndM(const Tensor& input) { + Tensor output; + output.ShareDataWith(input); + auto in_dims = input.dims(); + if (in_dims.size() == 3) { + std::vector out_dims = {in_dims[0] * in_dims[1], in_dims[2]}; + output.Resize(make_ddim(out_dims)); + } + return output; +} + +// Reshape a rank-3 tensor from P x M x N to M x (P * N). +// (Warning: This requires transposing data and writes into new memory.) +// Identity op if the tensor is not of rank 3. +template +Tensor CombineBatchAndN(const DeviceContext& context, const Tensor& input) { + Tensor output; + auto in_dims = input.dims(); + if (in_dims.size() == 3) { + output.Resize({in_dims[1], in_dims[0], in_dims[2]}); + output.mutable_data(context.GetPlace()); + std::vector axis = {1, 0, 2}; + math::Transpose trans; + trans(context, input, &output, axis); + std::vector out_dims = {in_dims[1], in_dims[0] * in_dims[2]}; + output.Resize({in_dims[1], in_dims[0] * in_dims[2]}); + } else { + output.ShareDataWith(input); + } + return output; +} + +// Using dimensional constraints on matrix multiplication, it is +// straight-forward to check the following table for when X and Y +// are both matrices. +// +// transpose_X | False | True | False | True +// transpose_Y | False | False | True | True +// -----------+----------+----------+----------+----------- +// dX = | dOut Y^T | Y dOut^T | dOut Y | Y^T dOut^T +// dY = | X^T dOut | X dOut | dOut^T X | dOut^T X^T +// +// When X is a vector of size K, we treat it instead as a matrix of shape +// (1, K). Similarly, when Y is a vector of size K, we treat it instead as +// a matrix of shape (K, 1). +// +// When X and Y are both 3-dimensional tensors, then the first dimension +// the batch dimension can be ignored and the exact same formulas apply +// as for two matrices. +// +// Finally, when, e.g., X is a 3-dimensional tensor but Y is a matrix, we end +// up with formulas like +// +// dY_{ij} = \sum_{p, m} X_{pmi} dOut_{pmj} +// +// To handle this sort of scenario, we reshape X : P x M x K, dOut: P x M x N +// to X: (P * M) x K, dOut: (P * M) x N. 
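+// As a concrete check of the table above: with X : [2, 3], Y : [3, 4] and both
+// transpose flags false, dOut has shape [2, 4], so
+//   dX = dOut Y^T  ->  [2, 4] x [4, 3] = [2, 3]  (the shape of X)
+//   dY = X^T dOut  ->  [3, 2] x [2, 4] = [3, 4]  (the shape of Y)
+// which matches how the gradient kernel below invokes math::MatMulFunctor with
+// the corresponding transpose flags.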
+template +class MatMulGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor& x = *context.Input("X"); + const Tensor& y = *context.Input("Y"); + const Tensor& dout = *context.Input(framework::GradVarName("Out")); + Tensor* dx = context.Output(framework::GradVarName("X")); + Tensor* dy = context.Output(framework::GradVarName("Y")); + bool transpose_x = context.Attr("transpose_X"); + bool transpose_y = context.Attr("transpose_Y"); + + std::vector x_dims = vectorize(x.dims()); + std::vector y_dims = vectorize(y.dims()); + + // If X is a vector, reshape it to a matrix. + if (x_dims.size() == 1) { + x_dims.insert(x_dims.begin(), 1); + } + + // If Y is a vector, reshape it to a matrix. + if (y_dims.size() == 1) { + y_dims.push_back(1); + } + + int batch_count = 0; + // The first rank-2 dimensions are accumulated on the batch_count, and the + // last two dimensions are used for matrix multiplication. + if (x_dims.size() > 3) { + batch_count = accumulate(x_dims.begin(), x_dims.end() - 2, 1, + std::multiplies()); + } + // Fix the dOut dimensions. + int M = 0, N = 0, batchCountX = 0, batchCountY = 0; + + switch (x_dims.size()) { + case 2: + M = transpose_x ? x_dims[1] : x_dims[0]; + break; + case 3: + batchCountX = x_dims[0]; + M = transpose_x ? x_dims[2] : x_dims[1]; + break; + default: + batchCountX = batch_count; + size_t mat_s = x_dims.size() - 2; + M = transpose_x ? x_dims[mat_s + 1] : x_dims[mat_s]; + } + + switch (y_dims.size()) { + case 2: + N = transpose_y ? y_dims[0] : y_dims[1]; + break; + case 3: + batchCountY = y_dims[0]; + N = transpose_y ? y_dims[1] : y_dims[2]; + break; + default: + batchCountY = batch_count; + size_t mat_s = y_dims.size() - 2; + N = transpose_y ? y_dims[mat_s] : y_dims[mat_s + 1]; + } + if (batchCountX && batchCountY) { + PADDLE_ENFORCE_EQ( + batchCountX, batchCountY, + "When Input(X) and Input(Y) are both three dimensional, they " + "must have the same batch dimension."); + } + int batchCount = std::max(batchCountX, batchCountY); + std::vector dout_dims = {M, N}; + if (batchCount) { + if (x_dims.size() > 3) { + dout_dims.insert(dout_dims.begin(), x_dims.begin(), x_dims.end() - 2); + } else { + dout_dims.insert(dout_dims.begin(), batchCount); + } + } + Tensor X = Reshape(x, make_ddim(x_dims)); + Tensor Y = Reshape(y, make_ddim(y_dims)); + Tensor dOut = Reshape(dout, make_ddim(dout_dims)); + + auto& dev_ctx = context.template device_context(); + if (dx) { + dx->mutable_data(context.GetPlace()); + const Tensor& dOut_for_dX = + (x_dims.size() == 2 && y_dims.size() == 3) + ? CombineBatchAndN(dev_ctx, dOut) + : dOut; + if (x_dims.size() == 2 && y_dims.size() == 3) { + Y = transpose_y ? CombineBatchAndM(Y) + : CombineBatchAndN(dev_ctx, Y); + } + if (transpose_x) { + math::MatMulFunctor()( + dev_ctx, Y, transpose_y, dOut_for_dX, transpose_x, T(1), dx, T(0)); + } else { + math::MatMulFunctor()( + dev_ctx, dOut_for_dX, transpose_x, Y, !transpose_y, T(1), dx, T(0)); + } + } + + if (dy) { + dy->mutable_data(context.GetPlace()); + const Tensor& dOut_for_dY = (y_dims.size() == 2 && x_dims.size() == 3) + ? CombineBatchAndM(dOut) + : dOut; + if (y_dims.size() == 2 && x_dims.size() == 3) { + X = transpose_x ? 
CombineBatchAndN(dev_ctx, X) + : CombineBatchAndM(X); + dOut = CombineBatchAndM(dOut); + } + if (transpose_y) { + math::MatMulFunctor()( + dev_ctx, dOut_for_dY, transpose_y, X, transpose_x, T(1), dy, T(0)); + } else { + math::MatMulFunctor()( + dev_ctx, X, !transpose_x, dOut_for_dY, transpose_y, T(1), dy, T(0)); + } + } + } +}; +} // namespace matmul_detail + +using matmul_detail::MatMulKernel; +using matmul_detail::MatMulGradKernel; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/max_sequence_len_op.cc b/paddle/fluid/operators/max_sequence_len_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..eff8b927e52c94a4e19bb10c644cbaa34a7a0581 --- /dev/null +++ b/paddle/fluid/operators/max_sequence_len_op.cc @@ -0,0 +1,65 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { + +class MaxSeqenceLenOp : public framework::OperatorBase { + public: + MaxSeqenceLenOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::Place &dev_place) const override { + auto &rank_table = + scope.FindVar(Input("RankTable"))->Get(); + auto *out = + scope.FindVar(Output("Out"))->GetMutable(); + int64_t *out_ptr = out->mutable_data({1}, platform::CPUPlace()); + *out_ptr = rank_table.items()[0].length; + } +}; + +class MaxSeqenceLenOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + MaxSeqenceLenOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("RankTable", "The lod_rank_table."); + AddOutput("Out", "The max sequence length."); + AddComment( + R"DOC(Calculate the max sequence length through lod_rank_table.)DOC"); + } +}; + +class MaxSeqenceLenInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInput("RankTable")); + context->SetOutputDim("Out", {1}); + } +}; +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(max_sequence_len, paddle::operators::MaxSeqenceLenOp, + paddle::operators::MaxSeqenceLenOpProtoMaker, + paddle::operators::MaxSeqenceLenInferShape, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/maxout_op.cc b/paddle/fluid/operators/maxout_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..8ce12cd4c4d93327dfcf9eb40ae1ff429f703419 --- /dev/null +++ b/paddle/fluid/operators/maxout_op.cc @@ -0,0 +1,108 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include "paddle/fluid/operators/maxout_op.h" +namespace paddle { +namespace operators { + +using framework::Tensor; + +class MaxOutOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MaxOutOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "X", + "(Tensor) The input tensor of maxout operator. " + "The format of input tensor is NCHW. Where N is batch size, C is the " + "number of channels, H and W is the height and width of feature."); + AddOutput("Out", + "(Tensor) The output tensor of maxout operator." + "The format of output tensor is also NCHW." + "Where N is batch size, C is " + "the number of channels, H and W is the height and " + "width of feature."); + AddAttr( + "groups", + R"DOC("Specifies how many groups the input tensor will be split" + "in the channel dimension. And the number of output channel is " + "the number of channels divided by groups.." + )DOC"); + AddComment(R"DOC( +MaxOut Operator. + +Assumed the input shape is (N, Ci, H, W). +The output shape is (N, Co, H, W). +Then $Co = Ci / groups$ and the operator formula is as follows: + +$$ +y_{si+j} = \max_k x_{gsi + sk + j} \\ +g = groups \\ +s = \frac{input.size}{num\_channels} \\ +0 \le i < \frac{num\_channels}{groups} \\ +0 \le j < s \\ +0 \le k < groups +$$ + +Please refer to Paper: + - Maxout Networks: http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf + - Multi-digit Number Recognition from Street View \ + Imagery using Deep Convolutional Neural Networks: \ + https://arxiv.org/pdf/1312.6082v4.pdf + +)DOC"); + } +}; + +class MaxOutOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of MaxoutOp" + "should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of MaxoutOp should not be null."); + auto in_x_dims = ctx->GetInputDim("X"); + int groups = ctx->Attrs().Get("groups"); + // check groups > 1 + PADDLE_ENFORCE_GT(groups, 1, "groups should be larger than 1 in maxoutop"); + std::vector output_shape({in_x_dims[0], in_x_dims[1] / groups}); + output_shape.push_back(in_x_dims[2]); + output_shape.push_back(in_x_dims[3]); + ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + } +}; + +class MaxOutOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Input(X@GRAD) should not be null."); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(maxout, ops::MaxOutOp, ops::MaxOutOpMaker, 
maxout_grad, + ops::MaxOutOpGrad); +REGISTER_OP_CPU_KERNEL( + maxout, ops::MaxOutKernel); +REGISTER_OP_CPU_KERNEL( + maxout_grad, + ops::MaxOutGradKernel); diff --git a/paddle/fluid/operators/maxout_op.cu.cc b/paddle/fluid/operators/maxout_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..f3f45c90cde754bcbf985092c5cbf31f134a2eee --- /dev/null +++ b/paddle/fluid/operators/maxout_op.cu.cc @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/maxout_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + maxout, ops::MaxOutKernel, + ops::MaxOutKernel); +REGISTER_OP_CUDA_KERNEL( + maxout_grad, + ops::MaxOutGradKernel, + ops::MaxOutGradKernel); diff --git a/paddle/fluid/operators/maxout_op.h b/paddle/fluid/operators/maxout_op.h new file mode 100644 index 0000000000000000000000000000000000000000..e5de3e3760b99c9bde9dd86e8851dc2f65e4b2d2 --- /dev/null +++ b/paddle/fluid/operators/maxout_op.h @@ -0,0 +1,62 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/maxouting.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class MaxOutKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* in_x = context.Input("X"); + Tensor* out = context.Output("Out"); + int groups = context.template Attr("groups"); + + math::MaxOutFunctor maxout_forward; + maxout_forward(context.template device_context(), *in_x, out, + groups); + } +}; + +template +class MaxOutGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* in_x = context.Input("X"); + const Tensor* out = context.Input("Out"); + const Tensor* out_grad = + context.Input(framework::GradVarName("Out")); + Tensor* in_x_grad = context.Output(framework::GradVarName("X")); + int groups = context.template Attr("groups"); + auto& device_ctx = context.template device_context(); + math::SetConstant zero; + if (in_x_grad) { + in_x_grad->mutable_data(context.GetPlace()); + zero(device_ctx, in_x_grad, static_cast(0.0)); + math::MaxOutGradFunctor maxout_backward; + maxout_backward(device_ctx, *in_x, in_x_grad, *out, *out_grad, groups); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/mean_op.cc b/paddle/fluid/operators/mean_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..1043820345a21cc3a7bbdf45d2914f9319c8e708 --- /dev/null +++ b/paddle/fluid/operators/mean_op.cc @@ -0,0 +1,84 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/mean_op.h" + +namespace paddle { +namespace operators { + +class MeanOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of MeanOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of MeanOp should not be null."); + ctx->SetOutputDim("Out", {1}); + } +}; + +class MeanOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MeanOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input of mean op"); + AddOutput("Out", "The output of mean op"); + AddComment(R"DOC( +Mean Operator. + +Out is a scalar which is the mean of all elements in X. 
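For example, if X is the 2 x 2 tensor [[1, 2], [3, 4]], then Out is the
1-element tensor [2.5].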
+ +)DOC"); + } +}; + +class MeanGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD("X", framework::GradVarName("X")); + } +}; + +class MeanGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* grad_op = new framework::OpDesc(); + grad_op->SetType("mean_grad"); + grad_op->SetInput("X", Input("X")); + grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(mean, ops::MeanOp, ops::MeanOpMaker, ops::MeanGradMaker); +REGISTER_OPERATOR(mean_grad, ops::MeanGradOp); +REGISTER_OP_CPU_KERNEL( + mean, ops::MeanKernel, + ops::MeanKernel); +REGISTER_OP_CPU_KERNEL( + mean_grad, ops::MeanGradKernel, + ops::MeanGradKernel); diff --git a/paddle/fluid/operators/mean_op.cu b/paddle/fluid/operators/mean_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..ccf2248760a551174953b8b55dc9a69454074885 --- /dev/null +++ b/paddle/fluid/operators/mean_op.cu @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU + +#include "paddle/fluid/operators/mean_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + mean, ops::MeanKernel, + ops::MeanKernel); +REGISTER_OP_CUDA_KERNEL( + mean_grad, ops::MeanGradKernel, + ops::MeanGradKernel); diff --git a/paddle/fluid/operators/mean_op.h b/paddle/fluid/operators/mean_op.h new file mode 100644 index 0000000000000000000000000000000000000000..ae162287da6a5b9e826dec9552262e73468ee58a --- /dev/null +++ b/paddle/fluid/operators/mean_op.h @@ -0,0 +1,67 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenScalar = framework::EigenScalar; +template +using EigenVector = framework::EigenVector; + +template +class MeanKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + + output->mutable_data(context.GetPlace()); + + auto X = EigenVector::Flatten(*input); + auto y = EigenScalar::From(*output); + auto& place = + *context.template device_context().eigen_device(); + + y.device(place) = X.mean(); + } +}; + +template +class MeanGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto OG = context.Input(framework::GradVarName("Out")); + PADDLE_ENFORCE(OG->numel() == 1, "Mean Gradient should be scalar"); + auto IG = context.Output(framework::GradVarName("X")); + IG->mutable_data(context.GetPlace()); + + T ig_size = static_cast(IG->numel()); + Eigen::DSizes bcast(ig_size); + + EigenVector::Flatten(*IG).device( + *context.template device_context().eigen_device()) = + (EigenVector::From(*OG) / ig_size).broadcast(bcast); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/merge_lod_tensor_op.cc b/paddle/fluid/operators/merge_lod_tensor_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..255f55334093213df867852e4d222f0e227e8c5d --- /dev/null +++ b/paddle/fluid/operators/merge_lod_tensor_op.cc @@ -0,0 +1,186 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memcpy.h" + +namespace paddle { +namespace operators { + +using LoD = framework::LoD; + +class MergeLoDTensorOp : public framework::OperatorBase { + public: + MergeLoDTensorOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &dev_place) const override { + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); + + auto &x = scope.FindVar(Input("X"))->Get(); + auto &mask = scope.FindVar(Input("Mask"))->Get(); + auto &in_true = scope.FindVar(Input("InTrue"))->Get(); + auto &in_false = + scope.FindVar(Input("InFalse"))->Get(); + auto *out = + scope.FindVar(Output("Out"))->GetMutable(); + auto level = static_cast(Attr("level")); + + auto &mask_dim = mask.dims(); + + std::unique_ptr cpu_mask{new framework::LoDTensor()}; + if (platform::is_cpu_place(mask.place())) { + cpu_mask->ShareDataWith(mask); + } else if (platform::is_gpu_place(mask.place())) { +#ifdef PADDLE_WITH_CUDA + framework::Copy(mask, platform::CPUPlace(), dev_ctx, cpu_mask.get()); +#else + PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option"); +#endif + } + auto *mask_data = cpu_mask->data(); + + int rank = in_true.dims().size(); + platform::Place place = in_true.place(); + std::type_index data_type = in_true.type(); + framework::DDim in_true_dims = + framework::slice_ddim(in_true.dims(), 1, rank); + + int64_t batch_size = in_true.dims()[0] + in_false.dims()[0]; + + auto in_true_dim_vec = framework::vectorize(in_true_dims); + in_true_dim_vec.insert(in_true_dim_vec.begin(), batch_size); + + framework::DDim out_dims = framework::make_ddim(in_true_dim_vec); + out->Resize(out_dims); + out->mutable_data(place, data_type); + + auto *out_lod = out->mutable_lod(); + out_lod->clear(); + size_t out_offset = 0; + + // Build LoDTensor `out` + + size_t in_true_idx = 0; + size_t in_false_idx = 0; + for (size_t i = 0; i < static_cast(mask_dim[0]); i++) { + const framework::LoDTensor *input = nullptr; + size_t *in_idx = nullptr; + if (static_cast(mask_data[i]) == 0) { + input = &in_false; + in_idx = &in_false_idx; + } else { + input = &in_true; + in_idx = &in_true_idx; + } + auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset( + input->lod(), *in_idx, (*in_idx) + 1, 0); + auto &lod_length = lod_and_offset.first; + + framework::AppendLoD(out_lod, lod_length); + + size_t start_offset = lod_and_offset.second.first; + size_t end_offset = lod_and_offset.second.second; + + PADDLE_ENFORCE_GE(end_offset, start_offset); + size_t len = end_offset - start_offset; + if (len == 0) { + continue; + } + auto slice = out->Slice(out_offset, out_offset + len); + framework::Copy(input->Slice(start_offset, end_offset), place, dev_ctx, + &slice); + out_offset += len; + (*in_idx) += 1; + } + + for (size_t i = 0; i < level; i++) { + out_lod->insert(out_lod->begin(), x.lod()[i]); + } + } +}; + +class MergeLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + MergeLoDTensorOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "The input LoDTensor, contains complete lod information to " + "construct the output"); + AddInput("Mask", "A bool column vector which mask the 
input"); + AddInput("InTrue", "The True branch to be merged"); + AddInput("InFalse", "The False branch to be merged"); + AddOutput("Out", "The merged output LoDTensor"); + AddAttr("level", "(int) the specific lod level to rank.") + .SetDefault(0) + .EqualGreaterThan(0); + AddComment( + R"DOC( + Merge True and False branches of LoDTensor into a single Output, + with a mask at certain lod level. X is used to obtain complete + lod information. Please refer to SplitLoDTensorOp.)DOC"); + } +}; + +class MergeLoDTensorInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInput("X"), + "MergeLoDTensorOp must has input X."); + PADDLE_ENFORCE(context->HasInput("Mask"), + "MergeLoDTensorOp must has input Mask."); + PADDLE_ENFORCE(context->HasInput("InTrue"), + "MergeLoDTensorOp must has input InTrue."); + PADDLE_ENFORCE(context->HasInput("InFalse"), + "MergeLoDTensorOp must has input InFalse."); + PADDLE_ENFORCE(context->HasOutput("Out"), + "MergeLoDTensorOp must has output Out"); + + auto mask_dim = context->GetInputDim("Mask"); + PADDLE_ENFORCE_EQ(mask_dim.size(), 2); + PADDLE_ENFORCE_EQ(mask_dim[1], 1); + + context->SetOutputDim("Out", context->GetInputDim("InTrue")); + } +}; + +class MergeLoDTensorGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("split_lod_tensor"); + grad_op->SetInput("X", OutputGrad("Out")); + grad_op->SetInput("Mask", Input("Mask")); + grad_op->SetOutput("OutTrue", InputGrad("InTrue")); + grad_op->SetOutput("OutFalse", InputGrad("InFalse")); + grad_op->SetAttrMap(Attrs()); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(merge_lod_tensor, ops::MergeLoDTensorOp, + ops::MergeLoDTensorOpProtoMaker, + ops::MergeLoDTensorInferShape, ops::MergeLoDTensorGradMaker); diff --git a/paddle/fluid/operators/mine_hard_examples_op.cc b/paddle/fluid/operators/mine_hard_examples_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..73a6c0b679310ac4108a915836b5ed497853b38b --- /dev/null +++ b/paddle/fluid/operators/mine_hard_examples_op.cc @@ -0,0 +1,330 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +enum MiningType { kNone = 0, kMaxNegative, kHardExample }; + +template +bool SortScoreDescend(const std::pair& pair1, + const std::pair& pair2) { + return pair1.first > pair2.first; +} + +inline bool IsEligibleMining(const MiningType mining_type, const int match_idx, + const float match_dist, + const float neg_dist_threshold) { + if (mining_type == MiningType::kMaxNegative) { + return match_idx == -1 && match_dist < neg_dist_threshold; + } else if (mining_type == MiningType::kHardExample) { + return true; + } else { + return false; + } +} + +inline MiningType GetMiningType(std::string str) { + if (str == "max_negative") { + return MiningType::kMaxNegative; + } else if (str == "hard_example") { + return MiningType::kHardExample; + } else { + return MiningType::kNone; + } +} + +template +class MineHardExamplesKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in_cls_loss = ctx.Input("ClsLoss"); + auto* in_loc_loss = ctx.Input("LocLoss"); + auto* in_matched_indices = ctx.Input("MatchIndices"); + auto* in_match_dist = ctx.Input("MatchDist"); + float neg_pos_ratio = ctx.Attr("neg_pos_ratio"); + T neg_dist_threshold = + static_cast(ctx.Attr("neg_dist_threshold")); + int sample_size = ctx.Attr("sample_size"); + MiningType mining_type = + GetMiningType(ctx.Attr("mining_type")); + + auto out_neg_indices = ctx.Output("NegIndices"); + auto out_match_indices = + ctx.Output("UpdatedMatchIndices"); + + framework::Copy(*in_matched_indices, ctx.GetPlace(), out_match_indices); + + int batch_size = in_matched_indices->dims()[0]; + int prior_num = in_matched_indices->dims()[1]; + + auto match_indices = framework::EigenMatrix::From(*in_matched_indices); + + auto match_indices_et = + framework::EigenMatrix::From(*out_match_indices); + + auto match_dist = framework::EigenMatrix::From(*in_match_dist); + + const T* cls_loss = in_cls_loss->data(); + const T* loc_loss = nullptr; + if (in_loc_loss) { + loc_loss = in_loc_loss->data(); + } + + std::vector> all_neg_indices; + std::vector batch_starts = {0}; + for (int n = 0; n < batch_size; ++n) { + std::vector> loss_idx; + int neg_sel = 0; + for (int m = 0; m < prior_num; ++m) { + if (IsEligibleMining(mining_type, match_indices(n, m), match_dist(n, m), + neg_dist_threshold)) { + T loss = cls_loss[n * prior_num + m]; + if (mining_type == MiningType::kHardExample && loc_loss != nullptr) { + loss = cls_loss[n * prior_num + m] + loc_loss[n * prior_num + m]; + } + loss_idx.push_back(std::make_pair(loss, m)); + ++neg_sel; + } + } + + if (mining_type == MiningType::kMaxNegative) { + int num_pos = 0; + for (int m = 0; m < prior_num; ++m) { + if (match_indices(n, m) != -1) ++num_pos; + } + neg_sel = std::min(static_cast(num_pos * neg_pos_ratio), neg_sel); + } else if (mining_type == MiningType::kHardExample) { + neg_sel = std::min(sample_size, neg_sel); + } + + std::sort(loss_idx.begin(), loss_idx.end(), SortScoreDescend); + std::set sel_indices; + std::vector neg_indices; + std::transform(loss_idx.begin(), loss_idx.begin() + neg_sel, + std::inserter(sel_indices, sel_indices.begin()), + [](std::pair& l) -> int { + return static_cast(l.second); + }); + + if (mining_type == MiningType::kHardExample) { + for (int m = 0; m < prior_num; ++m) { + if (match_indices(n, m) > -1) { + if (sel_indices.find(m) == sel_indices.end()) { + match_indices_et(n, m) = -1; + } + } 
else { + if (sel_indices.find(m) != sel_indices.end()) { + neg_indices.push_back(m); + } + } + } + } else { + neg_indices.resize(sel_indices.size()); + std::copy(sel_indices.begin(), sel_indices.end(), neg_indices.begin()); + } + + all_neg_indices.push_back(neg_indices); + batch_starts.push_back(batch_starts.back() + neg_indices.size()); + } + + framework::LoD out_neg_indices_lod; + out_neg_indices_lod.emplace_back(batch_starts); + int neg_offset = 0; + auto neg_data = out_neg_indices->mutable_data( + framework::make_ddim({static_cast(batch_starts.back()), 1}), + ctx.GetPlace()); + + for (auto neg_indices : all_neg_indices) { + std::copy(neg_indices.begin(), neg_indices.end(), neg_data + neg_offset); + neg_offset += neg_indices.size(); + } + out_neg_indices->set_lod(out_neg_indices_lod); + return; + } +}; + +class MineHardExamplesOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("ClsLoss"), + "Input(ClsLoss) of MineHardExamplesOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("MatchIndices"), + "Input(MatchIndices) of MineHardExamplesOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("MatchDist"), + "Input(MatchDist) of MineHardExamplesOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("NegIndices"), + "Output(NegIndices) of MineHardExamplesOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("UpdatedMatchIndices"), + "Output(UpdatedMatchIndices) of MineHardExamplesOp should " + "not be null."); + + auto cls_loss_dims = ctx->GetInputDim("ClsLoss"); + auto idx_dims = ctx->GetInputDim("MatchIndices"); + auto dis_dims = ctx->GetInputDim("MatchDist"); + + PADDLE_ENFORCE_EQ(cls_loss_dims.size(), 2UL, + "The shape of ClsLoss is [N, Np]."); + PADDLE_ENFORCE_EQ(idx_dims.size(), 2UL, + "The shape of MatchIndices is [N, Np]."); + PADDLE_ENFORCE_EQ(dis_dims.size(), 2UL, + "The shape of MatchDist is [N, Np]."); + + if (ctx->HasInput("LocLoss")) { + auto loc_loss_dims = ctx->GetInputDim("LocLoss"); + PADDLE_ENFORCE_EQ(loc_loss_dims.size(), 2UL, + "The shape of LocLoss is [N, Np]."); + PADDLE_ENFORCE_EQ(cls_loss_dims[0], loc_loss_dims[0], + "Batch size of ClsLoss and LocLoss must be the same."); + PADDLE_ENFORCE_EQ( + cls_loss_dims[1], loc_loss_dims[1], + "Prior box number of ClsLoss and LocLoss must be the same."); + } + + PADDLE_ENFORCE_EQ( + cls_loss_dims[0], idx_dims[0], + "Batch size of ClsLoss and MatchIndices must be the same."); + PADDLE_ENFORCE_EQ( + cls_loss_dims[1], idx_dims[1], + "Prior box number of ClsLoss and MatchIndices must be the same."); + + PADDLE_ENFORCE_EQ(cls_loss_dims[0], dis_dims[0], + "Batch size of ClsLoss and MatchDist must be the same."); + PADDLE_ENFORCE_EQ( + cls_loss_dims[1], idx_dims[1], + "Prior box number of ClsLoss and MatchDist must be the same."); + + auto mining_type = + GetMiningType(ctx->Attrs().Get("mining_type")); + + PADDLE_ENFORCE_NE(mining_type, MiningType::kNone, + "mining_type must be hard_example or max_negative"); + + if (mining_type == MiningType::kMaxNegative) { + auto neg_pos_ratio = ctx->Attrs().Get("neg_pos_ratio"); + auto neg_dist_threshold = ctx->Attrs().Get("neg_dist_threshold"); + PADDLE_ENFORCE_GT( + neg_pos_ratio, 0.0f, + "neg_pos_ratio must greater than zero in max_negative mode"); + PADDLE_ENFORCE_GT( + neg_dist_threshold, 0.0f, + "neg_dist_threshold must greater than zero in max_negative mode"); + } else if (mining_type == 
MiningType::kHardExample) { + auto sample_size = ctx->Attrs().Get("sample_size"); + PADDLE_ENFORCE_GT( + sample_size, 0, + "sample_size must greater than zero in hard_example mode"); + } + + ctx->SetOutputDim("UpdatedMatchIndices", idx_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("ClsLoss")->type()), + ctx.device_context()); + } +}; + +class MineHardExamplesOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MineHardExamplesOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "ClsLoss", + "(Tensor, default Tensor), The classification loss with shape " + "[N, Np], N is the batch size and Np is the number of prior box."); + AddInput("LocLoss", + "(Tensor, optional, default Tensor), The localization loss " + "with shape [N, Np], N is the batch size and Np is the number of " + "prior box.") + .AsDispensable(); + AddInput("MatchIndices", + "(Tensor, Tensor), Matched indices with shape [N, Np], N is " + "the batch size and Np is the number of prior box. " + "MatchIndices[i][j] equal -1 means the j-th prior box in i-th " + "instance does not match any entity, otherwise means it is " + "matched to row."); + AddInput("MatchDist", + "(Tensor, default Tensor) Matched indices with shape [N, " + "Np], N is the batch size and Np is the number of prior box."); + AddAttr("neg_pos_ratio", + "(float) The ratio of the negative box to the positive " + "box. Use only when mining_type is max_negative.") + .SetDefault(1.0); + AddAttr("neg_dist_threshold", + "(float) The negative overlap upper bound for the unmatched " + "predictions. Use only when mining_type is max_negative.") + .SetDefault(0.5); + AddAttr("sample_size", + "(float) The max sample size of negative box. Use only when " + "mining_type is hard_example.") + .SetDefault(0); + AddAttr("mining_type", + "(float) The mining algorithm name, the value is " + "hard_example or max_negative.") + .SetDefault("max_negative") + .InEnum({"hard_example", "max_negative"}); + + AddOutput( + "NegIndices", + "(LoDTensor) The output of negative example indices. a LoDTensor " + "with shape [Neg, 1]. The size of lod[0] minus 1 is batch size, " + "and each element is the prior box index. " + "For example, the batch size is 2, the lod is [[0, 1, 2]], " + "the sample 0's box 1(MatchIndices[0][1]) is selected, " + "and sample 1's box 0 is selected. The output NegIndices is " + "[[1], [0]]."); + + AddOutput("UpdatedMatchIndices", + "(Tensor) The output of updated MatchIndices, a tensor with " + "shape [N, Np]. Only update when mining_type is " + "hard_example. The input MatchIndices elements will be update to " + "-1 when it is not in the candidate high loss list of negative " + "examples."); + + AddComment(R"DOC( +Mine hard examples Operator. +This operator implements hard example mining to select a subset of negative box indices. +For each image, selects the box with highest losses. subject to the condition that the +box cannot have an Matcht > neg_dist_threshold when mining_type is max_negative. +The selected number is min(sample_size, max_negative_box_number) when mining_type is +hard_example, or min(neg_pos_ratio * positive_box_number, max_negative_box_number) +when mining_type is max_negative, where the max_negative_box_number is the count of +MatchIndices elements with value -1. 
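For example, in max_negative mode with neg_pos_ratio = 3, an image with 2
matched (positive) prior boxes keeps at most min(3 * 2, number of eligible
negative boxes) negatives, where a box is eligible when it is unmatched
(MatchIndices is -1) and its MatchDist is below neg_dist_threshold.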
+)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(mine_hard_examples, ops::MineHardExamplesOp, + ops::MineHardExamplesOpMaker); + +REGISTER_OP_CPU_KERNEL( + mine_hard_examples, + ops::MineHardExamplesKernel, + ops::MineHardExamplesKernel); diff --git a/paddle/fluid/operators/minus_op.cc b/paddle/fluid/operators/minus_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..8a35d668ccfa8a95cd41c4a943aad6ff915cc7dd --- /dev/null +++ b/paddle/fluid/operators/minus_op.cc @@ -0,0 +1,105 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/minus_op.h" +#include "paddle/fluid/operators/net_op.h" + +namespace paddle { +namespace operators { + +class MinusOp : public framework::OperatorWithKernel { + public: + MinusOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of MinusOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), + "Input(Y) of MinusOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of MinusOp should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + + PADDLE_ENFORCE_EQ( + x_dims, y_dims, + "Minus operator must take two tensor with same num of elements"); + ctx->SetOutputDim("Out", x_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class MinusOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MinusOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The left tensor of minus operator."); + AddInput("Y", "The right tensor of minus operator."); + AddOutput("Out", "The output tensor of minus operator."); + + AddComment(R"DOC( +Minus Operator. + +Equation: + + $Out = X - Y$ + +Both the input `X` and `Y` can carry the LoD (Level of Details) information, +or not. But the output only shares the LoD information with input `X`. 
+ +)DOC"); + } +}; + +class MinusGradMaker : public framework::GradOpDescMakerBase { + public: + using framework::GradOpDescMakerBase::GradOpDescMakerBase; + + std::vector> operator()() const override { + std::vector> ops; + auto x_g = InputGrad("X"); + if (!x_g.empty()) { + auto *x_g_op = new framework::OpDesc(); + x_g_op->SetType("scale"); + x_g_op->SetInput("X", OutputGrad("Out")); + x_g_op->SetOutput("Out", x_g); + x_g_op->SetAttr("scale", 1.0f); + ops.emplace_back(x_g_op); + } + + auto y_g = InputGrad("Y"); + if (!y_g.empty()) { + auto *y_g_op = new framework::OpDesc(); + y_g_op->SetType("scale"); + y_g_op->SetInput("X", OutputGrad("Out")); + y_g_op->SetOutput("Out", y_g); + y_g_op->SetAttr("scale", -1.0f); + ops.emplace_back(y_g_op); + } + + return ops; + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(minus, ops::MinusOp, ops::MinusOpMaker, ops::MinusGradMaker); +REGISTER_OP_CPU_KERNEL( + minus, ops::MinusKernel); diff --git a/paddle/fluid/operators/minus_op.cu b/paddle/fluid/operators/minus_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..ce0b1fdc0419fead915511581270fb9984df9dc5 --- /dev/null +++ b/paddle/fluid/operators/minus_op.cu @@ -0,0 +1,19 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/minus_op.h" + +REGISTER_OP_CUDA_KERNEL( + minus, + paddle::operators::MinusKernel); diff --git a/paddle/fluid/operators/minus_op.h b/paddle/fluid/operators/minus_op.h new file mode 100644 index 0000000000000000000000000000000000000000..dc94cbbeca264536669716beb5318432ba9689a4 --- /dev/null +++ b/paddle/fluid/operators/minus_op.h @@ -0,0 +1,40 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class MinusKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* left_tensor = context.Input("X"); + auto* right_tensor = context.Input("Y"); + auto* out_tensor = context.Output("Out"); + + out_tensor->mutable_data(context.GetPlace()); + auto& dev = + *context.template device_context().eigen_device(); + framework::EigenVector::Flatten(*out_tensor).device(dev) = + framework::EigenVector::Flatten(*left_tensor) - + framework::EigenVector::Flatten(*right_tensor); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/modified_huber_loss_op.cc b/paddle/fluid/operators/modified_huber_loss_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f2d16531658d42556756e75895b7250a529f13df --- /dev/null +++ b/paddle/fluid/operators/modified_huber_loss_op.cc @@ -0,0 +1,119 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/modified_huber_loss_op.h" + +namespace paddle { +namespace operators { + +class ModifiedHuberLossOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "X must be initialized."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Y must be initialized."); + + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + + PADDLE_ENFORCE_EQ(x_dims, y_dims, "The shape of X and Y must be the same."); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "The tensor rank of X must be 2."); + PADDLE_ENFORCE_EQ(x_dims[1], 1, "The 2nd dimension of X must be 1."); + + ctx->SetOutputDim("IntermediateVal", x_dims); + ctx->SetOutputDim("Out", {x_dims[0], 1}); + } +}; + +class ModifiedHuberLossOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ModifiedHuberLossOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "The input tensor of modified huber loss op. " + "X is 2-D tensor with shape [batch_size, 1]."); + AddInput("Y", + "The target labels of modified huber loss op. " + "The shape of Y is the same as X. Values of Y must be 0 or 1."); + AddOutput("IntermediateVal", + "Variable to save intermediate result which will be reused in " + "backward processing.") + .AsIntermediate(); + AddOutput("Out", "Classification loss for X."); + AddComment(R"DOC( +Modified Huber Loss Operator. + +This operator is used in binary classification problem. The shape of +input X and target Y are both [N, 1] and so is the shape of the output loss. +Since target Y is not differentiable, calculating gradient for Y is illegal. 
+The formula of modified huber loss is: + +$$ +L(y, f(x)) = +\begin{cases} +(\max(0, 1 - yf(x)))^2, \text{if} \ yf(x) >= -1 \\ + -4yf(x), \quad \text{otherwise} +\end{cases} +$$ + +Make sure the values of target label Y are in {0, 1} here. This operator will +scale values of Y to {-1, +1} when computing losses and gradients. + +)DOC"); + } +}; + +class ModifiedHuberLossGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "X must be initialized."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Y must be initialized."); + PADDLE_ENFORCE(ctx->HasInput("IntermediateVal"), + "Intermediate value must not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@Grad) must not be null."); + + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + auto intermediate_dims = ctx->GetInputDim("IntermediateVal"); + auto out_grad_dims = ctx->GetInputDim(framework::GradVarName("Out")); + + PADDLE_ENFORCE_EQ( + intermediate_dims, x_dims, + "The shape of X and intermediate value must be the same."); + PADDLE_ENFORCE_EQ(out_grad_dims, x_dims, + "The shape of Input(Out@Grad) and X must be the same."); + + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(modified_huber_loss, ops::ModifiedHuberLossOp, + ops::ModifiedHuberLossOpMaker, modified_huber_loss_grad, + ops::ModifiedHuberLossGradOp); + +REGISTER_OP_CPU_KERNEL( + modified_huber_loss, + ops::ModifiedHuberLossKernel); +REGISTER_OP_CPU_KERNEL(modified_huber_loss_grad, + ops::ModifiedHuberLossGradCPUKernel); diff --git a/paddle/fluid/operators/modified_huber_loss_op.cu b/paddle/fluid/operators/modified_huber_loss_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..69ac2b1ed546a4755cd4e8d52a7f8b98b4f0e7b9 --- /dev/null +++ b/paddle/fluid/operators/modified_huber_loss_op.cu @@ -0,0 +1,78 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/modified_huber_loss_op.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +struct ModifiedHuberLossBackward { + template + HOSTDEVICE void operator()(Tuple t) const { + auto inter_val = thrust::get<1>(t); + auto y_val = thrust::get<2>(t); + auto out_grad = thrust::get<3>(t); + if (inter_val < -1) { + thrust::get<0>(t) = -4 * (2 * y_val - 1) * out_grad; + } else if (inter_val < 1) { + thrust::get<0>(t) = -2 * (1 - inter_val) * (2 * y_val - 1) * out_grad; + } else { + thrust::get<0>(t) = 0; + } + } +}; + +template +class ModifiedHuberLossGradGPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in0 = context.Input("Y"); + auto* in1 = context.Input("IntermediateVal"); + auto* in2 = context.Input(framework::GradVarName("Out")); + auto* out0 = context.Output(framework::GradVarName("X")); + + if (out0) { + auto counts = framework::product(in1->dims()); + auto y_ptr = thrust::device_pointer_cast(in0->data()); + auto inter_val_ptr = thrust::device_pointer_cast(in1->data()); + auto out_grad_ptr = thrust::device_pointer_cast(in2->data()); + thrust::device_ptr x_grad_ptr( + out0->mutable_data(context.GetPlace())); + + auto iter_begin = thrust::make_zip_iterator( + thrust::make_tuple(x_grad_ptr, inter_val_ptr, y_ptr, out_grad_ptr)); + + auto iter_end = thrust::make_zip_iterator( + thrust::make_tuple(x_grad_ptr + counts, inter_val_ptr + counts, + y_ptr + counts, out_grad_ptr + counts)); + + thrust::for_each(iter_begin, iter_end, ModifiedHuberLossBackward()); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + modified_huber_loss, + ops::ModifiedHuberLossKernel); +REGISTER_OP_CUDA_KERNEL(modified_huber_loss_grad, + ops::ModifiedHuberLossGradGPUKernel); diff --git a/paddle/fluid/operators/modified_huber_loss_op.h b/paddle/fluid/operators/modified_huber_loss_op.h new file mode 100644 index 0000000000000000000000000000000000000000..a470a45e13b55a33a8485cac2038ce1cc761a3f3 --- /dev/null +++ b/paddle/fluid/operators/modified_huber_loss_op.h @@ -0,0 +1,106 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; + +template +struct CheckLabelValue { + HOSTDEVICE T operator()(const T& val) const { + PADDLE_ASSERT(val == static_cast(0) || val == static_cast(1)); + } +}; + +template +struct ModifiedHuberLossForward { + HOSTDEVICE T operator()(const T& val) const { + if (val < -1) { + return -4 * val; + } else if (val < 1) { + return (1 - val) * (1 - val); + } else { + return static_cast(0); + } + } +}; + +template +class ModifiedHuberLossKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in0 = context.Input("X"); + auto* in1 = context.Input("Y"); + auto* out0 = context.Output("IntermediateVal"); + auto* out1 = context.Output("Out"); + + out0->mutable_data(context.GetPlace()); + out1->mutable_data(context.GetPlace()); + auto& place = + *context.template device_context().eigen_device(); + + auto x = EigenVector::Flatten(*in0); + auto y = EigenVector::Flatten(*in1); + // make sure value's of Y in {0, 1} + y.unaryExpr(CheckLabelValue()); + auto inter_val = EigenVector::Flatten(*out0); + // scale y to {-1, +1} and compute x * y + inter_val.device(place) = x * (2 * y - static_cast(1)); + auto loss = EigenVector::Flatten(*out1); + loss.device(place) = inter_val.unaryExpr(ModifiedHuberLossForward()); + } +}; + +// CPU backward kernel +template +class ModifiedHuberLossGradCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in0 = context.Input("Y"); + auto* in1 = context.Input("IntermediateVal"); + auto* in2 = context.Input(framework::GradVarName("Out")); + auto* out0 = context.Output(framework::GradVarName("X")); + + if (out0) { + const T* y_ptr = in0->data(); + const T* inter_val_ptr = in1->data(); + const T* out_grad_ptr = in2->data(); + size_t counts = static_cast(framework::product(in1->dims())); + T* x_grad_ptr = out0->mutable_data(context.GetPlace()); + for (size_t i = 0; i < counts; ++i) { + if (inter_val_ptr[i] < -1) { + x_grad_ptr[i] = -4 * (2 * y_ptr[i] - 1) * out_grad_ptr[i]; + } else if (inter_val_ptr[i] < 1) { + x_grad_ptr[i] = -2 * (1 - inter_val_ptr[i]) * (2 * y_ptr[i] - 1) * + out_grad_ptr[i]; + } else { + x_grad_ptr[i] = 0; + } + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/momentum_op.cc b/paddle/fluid/operators/momentum_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a3950ac99da93687596f62c6f020f4add1fb04ba --- /dev/null +++ b/paddle/fluid/operators/momentum_op.cc @@ -0,0 +1,108 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/momentum_op.h" + +namespace paddle { +namespace operators { + +class MomentumOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Param"), + "Input(param) of Momentum should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(grad) of Momentum should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Velocity"), + "Input(velocity) of Momentum should not be null."); + PADDLE_ENFORCE(ctx->HasInput("LearningRate"), + "Input(LearningRate) of Momentum should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + "Output(ParamOut) of Momentum should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("VelocityOut"), + "Output(VelocityOut) of Momentum should not be null."); + + auto param_dim = ctx->GetInputDim("Param"); + PADDLE_ENFORCE_EQ( + param_dim, ctx->GetInputDim("Grad"), + "Param and Grad input of MomentumOp should have the same dimension."); + PADDLE_ENFORCE_EQ( + param_dim, ctx->GetInputDim("Velocity"), + "Param and Velocity of MomentumOp should have the same dimension."); + PADDLE_ENFORCE_EQ(framework::product(ctx->GetInputDim("LearningRate")), 1, + "Learning_rate should be a scalar"); + + ctx->SetOutputDim("ParamOut", param_dim); + ctx->SetOutputDim("VelocityOut", param_dim); + } +}; + +class MomentumOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MomentumOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Param", + "(Tensor, default Tensor) " + "Input parameter that has to be updated"); + AddInput("Grad", + "(Tensor, default Tensor) " + "Input gradient of the parameter"); + AddInput("Velocity", + "(Tensor, default Tensor) " + "Input velocity (corresponding to the parameter) " + "that has to be updated"); + AddInput("LearningRate", + "(Tensor, default Tensor) " + "Input learning rate"); + + AddOutput("ParamOut", + "(Tensor) This output is updated parameter. " + "It shared memory with Input(Param)."); + AddOutput("VelocityOut", + "(Tensor) This output is updated velocity. " + "It shared memory with Input(Velocity)."); + + AddAttr("mu", "(float) Momentum coefficient"); + AddAttr("use_nesterov", + "(bool, default false) " + "Use Nesterov Momentum") + .SetDefault(false); + AddComment(R"DOC( +Momentum Optimizer. + +This optimizer has a flag for Nestrov Momentum. +The update equations are as follows: + +$$ +velocity = mu * velocity + gradient \\ +if (use\_nesterov): \\ + param = param - gradient * learning\_rate + mu * velocity * learning\_rate \\ +else: \\ + param = param - learning\_rate * velocity. \\ +$$ + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(momentum, ops::MomentumOp, ops::MomentumOpMaker); +REGISTER_OP_CPU_KERNEL(momentum, ops::MomentumOpKernel, + ops::MomentumOpKernel); diff --git a/paddle/fluid/operators/momentum_op.cu b/paddle/fluid/operators/momentum_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..28a14cd4b219b86ba4922a3485b99c2c861a74d9 --- /dev/null +++ b/paddle/fluid/operators/momentum_op.cu @@ -0,0 +1,78 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +__global__ void MomentumKernel(const T* p, const T* g, const T* v, + const T* learning_rate, const T mu, + const int64_t num, bool use_nesterov, T* p_out, + T* v_out) { + T lr = learning_rate[0]; + if (use_nesterov) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; + i += blockDim.x * gridDim.x) { + T g_val = g[i]; + T v_new = v[i] * mu + g_val; + v_out[i] = v_new; + p_out[i] = p[i] - (g_val - v_new * mu) * lr; + } + } else { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; + i += blockDim.x * gridDim.x) { + T v_new = v[i] * mu + g[i]; + v_out[i] = v_new; + p_out[i] = p[i] - lr * v_new; + } + } +} + +template +class MomentumOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto param_out = ctx.Output("ParamOut"); + auto velocity_out = ctx.Output("VelocityOut"); + auto param = ctx.Input("Param"); + auto velocity = ctx.Input("Velocity"); + auto grad = ctx.Input("Grad"); + auto learning_rate = ctx.Input("LearningRate"); + + T* p_out = param_out->mutable_data(ctx.GetPlace()); + T* v_out = velocity_out->mutable_data(ctx.GetPlace()); + + T mu = static_cast(ctx.Attr("mu")); + bool use_nesterov = ctx.Attr("use_nesterov"); + + auto* p = param->data(); + auto* v = velocity->data(); + auto* g = grad->data(); + auto* lr = learning_rate->data(); + + int block = 512; + int grid = (param->numel() + block - 1) / block; + MomentumKernel<<>>( + p, g, v, lr, mu, param->numel(), use_nesterov, p_out, v_out); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(momentum, ops::MomentumOpCUDAKernel, + ops::MomentumOpCUDAKernel); diff --git a/paddle/fluid/operators/momentum_op.h b/paddle/fluid/operators/momentum_op.h new file mode 100644 index 0000000000000000000000000000000000000000..fdab86b24eefe15b85f4ca6a49e54ea67c1e7bdf --- /dev/null +++ b/paddle/fluid/operators/momentum_op.h @@ -0,0 +1,57 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class MomentumOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto param_out = ctx.Output("ParamOut"); + auto velocity_out = ctx.Output("VelocityOut"); + auto param = ctx.Input("Param"); + auto velocity = ctx.Input("Velocity"); + auto grad = ctx.Input("Grad"); + auto learning_rate = ctx.Input("LearningRate"); + + param_out->mutable_data(ctx.GetPlace()); + velocity_out->mutable_data(ctx.GetPlace()); + + T mu = static_cast(ctx.Attr("mu")); + bool use_nesterov = ctx.Attr("use_nesterov"); + + auto p_out = framework::EigenVector::Flatten(*param_out); + auto v_out = framework::EigenVector::Flatten(*velocity_out); + + auto p = framework::EigenVector::Flatten(*param); + auto v = framework::EigenVector::Flatten(*velocity); + auto g = framework::EigenVector::Flatten(*grad); + auto* lr = learning_rate->data(); + + v_out = v * mu + g; + if (use_nesterov) { + p_out = p - (g - v_out * mu) * lr[0]; + } else { + p_out = p - lr[0] * v_out; + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c9375d8ea1297f5201b6294f61119f6b5603d988 --- /dev/null +++ b/paddle/fluid/operators/mul_op.cc @@ -0,0 +1,166 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/mul_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class MulOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of MulOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) of MulOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of MulOp should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + + int x_num_col_dims = ctx->Attrs().Get("x_num_col_dims"); + int y_num_col_dims = ctx->Attrs().Get("y_num_col_dims"); + + VLOG(3) << "mul operator x.shape=" << x_dims << " y.shape=" << y_dims + << " x_num_col_dims=" << x_num_col_dims + << " y_num_col_dims=" << y_num_col_dims; + + PADDLE_ENFORCE_GT( + x_dims.size(), x_num_col_dims, + "The input tensor X's rank of MulOp should be larger than " + "x_num_col_dims."); + PADDLE_ENFORCE_GT( + y_dims.size(), y_num_col_dims, + "The input tensor Y's rank of MulOp should be larger than " + "y_num_col_dims."); + + auto x_mat_dims = framework::flatten_to_2d(x_dims, x_num_col_dims); + auto y_mat_dims = framework::flatten_to_2d(y_dims, y_num_col_dims); + + PADDLE_ENFORCE_EQ( + x_mat_dims[1], y_mat_dims[0], + "First matrix's width must be equal with second matrix's height."); + std::vector output_dims; + output_dims.reserve( + static_cast(x_num_col_dims + y_dims.size() - y_num_col_dims)); + + for (int i = 0; i < x_num_col_dims; ++i) { + output_dims.push_back(x_dims[i]); + } + + for (int i = y_num_col_dims; i < y_dims.size(); ++i) { + output_dims.push_back(y_dims[i]); + } + + ctx->SetOutputDim("Out", framework::make_ddim(output_dims)); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class MulOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MulOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor), The first input tensor of mul op."); + AddInput("Y", "(Tensor), The second input tensor of mul op."); + AddOutput("Out", "(Tensor), The output tensor of mul op."); + AddAttr( + "x_num_col_dims", + R"DOC((int, default 1), The mul_op can take tensors with more than two + dimensions as its inputs. If the input $X$ is a tensor with more + than two dimensions, $X$ will be flattened into a two-dimensional + matrix first. The flattening rule is: the first `num_col_dims` + will be flattened to form the first dimension of the final matrix + (the height of the matrix), and the rest `rank(X) - num_col_dims` + dimensions are flattened to form the second dimension of the final + matrix (the width of the matrix). As a result, height of the + flattened matrix is equal to the product of $X$'s first + `x_num_col_dims` dimensions' sizes, and width of the flattened + matrix is equal to the product of $X$'s last `rank(x) - num_col_dims` + dimensions' size. For example, suppose $X$ is a 6-dimensional + tensor with the shape [2, 3, 4, 5, 6], and `x_num_col_dims` = 3. + Thus, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = + [24, 30]. + )DOC") + .SetDefault(1) + .EqualGreaterThan(1); + AddAttr( + "y_num_col_dims", + R"DOC((int, default 1), The mul_op can take tensors with more than two, + dimensions as its inputs. If the input $Y$ is a tensor with more + than two dimensions, $Y$ will be flattened into a two-dimensional + matrix first. The attribute `y_num_col_dims` determines how $Y$ is + flattened. 
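+             For example, a $Y$ with shape [3, 4, 5, 6] and `y_num_col_dims` = 2
+             is flattened into a [3 x 4, 5 x 6] = [12, 30] matrix.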
See comments of `x_num_col_dims` for more details. + )DOC") + .SetDefault(1) + .EqualGreaterThan(1); + AddComment(R"DOC( +Mul Operator. + +This operator is used to perform matrix multiplication for input $X$ and $Y$. + +The equation is: + +$$Out = X * Y$$ + +Both the input $X$ and $Y$ can carry the LoD (Level of Details) information, +or not. But the output only shares the LoD information with input $X$. + +)DOC"); + } +}; + +class MulOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); + + auto x_mat_dims = framework::flatten_to_2d( + x_dims, ctx->Attrs().Get("x_num_col_dims")); + auto y_mat_dims = framework::flatten_to_2d( + y_dims, ctx->Attrs().Get("y_num_col_dims")); + + auto x_grad_name = framework::GradVarName("X"); + auto y_grad_name = framework::GradVarName("Y"); + + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + if (ctx->HasOutput(y_grad_name)) { + ctx->SetOutputDim(y_grad_name, y_dims); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(mul, paddle::framework::OperatorWithKernel, ops::MulOpMaker, + ops::MulOpShapeInference, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(mul_grad, ops::MulOpGrad); +REGISTER_OP_CPU_KERNEL( + mul, ops::MulKernel); +REGISTER_OP_CPU_KERNEL( + mul_grad, ops::MulGradKernel); diff --git a/paddle/fluid/operators/mul_op.cu.cc b/paddle/fluid/operators/mul_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..6f605fd84fb802a079f9ed13afc032cc2d6c2b0c --- /dev/null +++ b/paddle/fluid/operators/mul_op.cu.cc @@ -0,0 +1,21 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/mul_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + mul, ops::MulKernel); +REGISTER_OP_CUDA_KERNEL( + mul_grad, ops::MulGradKernel); diff --git a/paddle/fluid/operators/mul_op.h b/paddle/fluid/operators/mul_op.h new file mode 100644 index 0000000000000000000000000000000000000000..745989f07f3646ab5d59f3d292030f3eb52dac49 --- /dev/null +++ b/paddle/fluid/operators/mul_op.h @@ -0,0 +1,105 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/operators/math/math_function.h" + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class MulKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* x = context.Input("X"); + const Tensor* y = context.Input("Y"); + Tensor* z = context.Output("Out"); + const Tensor x_matrix = + x->dims().size() > 2 + ? framework::ReshapeToMatrix( + *x, context.template Attr("x_num_col_dims")) + : *x; + const Tensor y_matrix = + y->dims().size() > 2 + ? framework::ReshapeToMatrix( + *y, context.template Attr("y_num_col_dims")) + : *y; + + z->mutable_data(context.GetPlace()); + auto z_dim = z->dims(); + if (z_dim.size() != 2) { + z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); + } + math::matmul( + context.template device_context(), x_matrix, false, + y_matrix, false, 1, z, 0); + if (z_dim.size() != 2) { + z->Resize(z_dim); + } + } +}; + +template +class MulGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + int x_num_col_dims = ctx.template Attr("x_num_col_dims"); + int y_num_col_dims = ctx.template Attr("y_num_col_dims"); + const Tensor* x = ctx.Input("X"); + const Tensor* y = ctx.Input("Y"); + const Tensor x_matrix = x->dims().size() > 2 + ? framework::ReshapeToMatrix(*x, x_num_col_dims) + : *x; + const Tensor y_matrix = y->dims().size() > 2 + ? framework::ReshapeToMatrix(*y, y_num_col_dims) + : *y; + const Tensor* dout = ctx.Input(framework::GradVarName("Out")); + + Tensor dout_mat; + dout_mat.ShareDataWith(*dout); + dout_mat.Resize({framework::flatten_to_2d(x->dims(), x_num_col_dims)[0], + framework::flatten_to_2d(y->dims(), y_num_col_dims)[1]}); + + Tensor* dx = ctx.Output(framework::GradVarName("X")); + Tensor* dy = ctx.Output(framework::GradVarName("Y")); + auto& dev_ctx = ctx.template device_context(); + if (dx) { + dx->mutable_data(ctx.GetPlace()); + Tensor dx_matrix = dx->dims().size() > 2 + ? framework::ReshapeToMatrix(*dx, x_num_col_dims) + : *dx; + + // dx = dout * y'. dx: M x K, dout : M x N, y : K x N + math::matmul(dev_ctx, dout_mat, false, y_matrix, true, + 1, &dx_matrix, 0); + } + if (dy) { + dy->mutable_data(ctx.GetPlace()); + Tensor dy_matrix = dy->dims().size() > 2 + ? framework::ReshapeToMatrix(*dy, y_num_col_dims) + : *dy; + // dy = x' * dout. dy K x N, dout : M x N, x : M x K + math::matmul(dev_ctx, x_matrix, true, dout_mat, false, + 1, &dy_matrix, 0); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/multiclass_nms_op.cc b/paddle/fluid/operators/multiclass_nms_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..b2934f69cc9b2e50bdd5cbdf04deeaf5ca120e2c --- /dev/null +++ b/paddle/fluid/operators/multiclass_nms_op.cc @@ -0,0 +1,384 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +constexpr int64_t kOutputDim = 6; +constexpr int64_t kBBoxSize = 4; + +class MultiClassNMSOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("BBoxes"), + "Input(BBoxes) of MultiClassNMS should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Scores"), + "Input(Scores) of MultiClassNMS should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of MultiClassNMS should not be null."); + + auto box_dims = ctx->GetInputDim("BBoxes"); + auto score_dims = ctx->GetInputDim("Scores"); + + PADDLE_ENFORCE_EQ(box_dims.size(), 2, + "The rank of Input(BBoxes) must be 2."); + PADDLE_ENFORCE_EQ(score_dims.size(), 3, + "The rank of Input(Scores) must be 3."); + PADDLE_ENFORCE_EQ(box_dims[1], 4, + "The 2nd dimension of Input(BBoxes) must be 4, " + "represents the layout of coordinate " + "[xmin, ymin, xmax, ymax]"); + PADDLE_ENFORCE_EQ(box_dims[0], score_dims[2], + "The 1st dimensiong of Input(BBoxes) must be equal to " + "3rd dimension of Input(Scores), which represents the " + "predicted bboxes."); + + // Here the box_dims[0] is not the real dimension of output. + // It will be rewritten in the computing kernel. + ctx->SetOutputDim("Out", {box_dims[0], 6}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType( + ctx.Input("Scores")->type()), + ctx.device_context()); + } +}; + +template +bool SortScorePairDescend(const std::pair& pair1, + const std::pair& pair2) { + return pair1.first > pair2.first; +} + +template +static inline void GetMaxScoreIndex( + const std::vector& scores, const T threshold, int top_k, + std::vector>* sorted_indices) { + for (size_t i = 0; i < scores.size(); ++i) { + if (scores[i] > threshold) { + sorted_indices->push_back(std::make_pair(scores[i], i)); + } + } + // Sort the score pair according to the scores in descending order + std::stable_sort(sorted_indices->begin(), sorted_indices->end(), + SortScorePairDescend); + // Keep top_k scores if needed. + if (top_k > -1 && top_k < static_cast(sorted_indices->size())) { + sorted_indices->resize(top_k); + } +} + +template +static inline T BBoxArea(const T* box, const bool normalized) { + if (box[2] < box[0] || box[3] < box[1]) { + // If coordinate values are is invalid + // (e.g. xmax < xmin or ymax < ymin), return 0. + return static_cast(0.); + } else { + const T w = box[2] - box[0]; + const T h = box[3] - box[1]; + if (normalized) { + return w * h; + } else { + // If coordinate values are not within range [0, 1]. 
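+ // In that case the coordinates are pixel indices, so a box spanning
+ // [xmin, xmax] covers (xmax - xmin + 1) pixels; hence the "+ 1" on
+ // both the width and the height below.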
+ return (w + 1) * (h + 1); + } + } +} + +template +static inline T JaccardOverlap(const T* box1, const T* box2, + const bool normalized) { + if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || + box2[3] < box1[1]) { + return static_cast(0.); + } else { + const T inter_xmin = std::max(box1[0], box2[0]); + const T inter_ymin = std::max(box1[1], box2[1]); + const T inter_xmax = std::min(box1[2], box2[2]); + const T inter_ymax = std::min(box1[3], box2[3]); + const T inter_w = inter_xmax - inter_xmin; + const T inter_h = inter_ymax - inter_ymin; + const T inter_area = inter_w * inter_h; + const T bbox1_area = BBoxArea(box1, normalized); + const T bbox2_area = BBoxArea(box2, normalized); + return inter_area / (bbox1_area + bbox2_area - inter_area); + } +} + +template +class MultiClassNMSKernel : public framework::OpKernel { + public: + void NMSFast(const Tensor& bbox, const Tensor& scores, + const T score_threshold, const T nms_threshold, const T eta, + const int64_t top_k, std::vector* selected_indices) const { + // The total boxes for each instance. + int64_t num_boxes = bbox.dims()[0]; + // 4: [xmin ymin xmax ymax] + int64_t box_size = bbox.dims()[1]; + + std::vector scores_data(num_boxes); + std::copy_n(scores.data(), num_boxes, scores_data.begin()); + std::vector> sorted_indices; + GetMaxScoreIndex(scores_data, score_threshold, top_k, &sorted_indices); + + selected_indices->clear(); + T adaptive_threshold = nms_threshold; + const T* bbox_data = bbox.data(); + + while (sorted_indices.size() != 0) { + const int idx = sorted_indices.front().second; + bool keep = true; + for (size_t k = 0; k < selected_indices->size(); ++k) { + if (keep) { + const int kept_idx = (*selected_indices)[k]; + T overlap = JaccardOverlap(bbox_data + idx * box_size, + bbox_data + kept_idx * box_size, true); + keep = overlap <= adaptive_threshold; + } else { + break; + } + } + if (keep) { + selected_indices->push_back(idx); + } + sorted_indices.erase(sorted_indices.begin()); + if (keep && eta < 1 && adaptive_threshold > 0.5) { + adaptive_threshold *= eta; + } + } + } + + void MultiClassNMS(const framework::ExecutionContext& ctx, + const Tensor& scores, const Tensor& bboxes, + std::map>& indices, + int& num_nmsed_out) const { + int64_t background_label = ctx.Attr("background_label"); + int64_t nms_top_k = ctx.Attr("nms_top_k"); + int64_t keep_top_k = ctx.Attr("keep_top_k"); + T nms_threshold = static_cast(ctx.Attr("nms_threshold")); + T nms_eta = static_cast(ctx.Attr("nms_eta")); + T score_threshold = static_cast(ctx.Attr("score_threshold")); + + int64_t class_num = scores.dims()[0]; + int64_t predict_dim = scores.dims()[1]; + int num_det = 0; + for (int64_t c = 0; c < class_num; ++c) { + if (c == background_label) continue; + Tensor score = scores.Slice(c, c + 1); + NMSFast(bboxes, score, score_threshold, nms_threshold, nms_eta, nms_top_k, + &(indices[c])); + num_det += indices[c].size(); + } + + num_nmsed_out = num_det; + const T* scores_data = scores.data(); + if (keep_top_k > -1 && num_det > keep_top_k) { + std::vector>> score_index_pairs; + for (const auto& it : indices) { + int label = it.first; + const T* sdata = scores_data + label * predict_dim; + const std::vector& label_indices = it.second; + for (size_t j = 0; j < label_indices.size(); ++j) { + int idx = label_indices[j]; + PADDLE_ENFORCE_LT(idx, predict_dim); + score_index_pairs.push_back( + std::make_pair(sdata[idx], std::make_pair(label, idx))); + } + } + // Keep top k results per image. 
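+ // score_index_pairs holds (score, (label, index)) for every box kept so
+ // far; sort it by score in descending order, truncate it to keep_top_k
+ // entries, and rebuild the per-class index map from the survivors.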
+ std::stable_sort(score_index_pairs.begin(), score_index_pairs.end(), + SortScorePairDescend>); + score_index_pairs.resize(keep_top_k); + + // Store the new indices. + std::map> new_indices; + for (size_t j = 0; j < score_index_pairs.size(); ++j) { + int label = score_index_pairs[j].second.first; + int idx = score_index_pairs[j].second.second; + new_indices[label].push_back(idx); + } + new_indices.swap(indices); + num_nmsed_out = keep_top_k; + } + } + + void MultiClassOutput(const Tensor& scores, const Tensor& bboxes, + std::map>& selected_indices, + Tensor* outs) const { + int predict_dim = scores.dims()[1]; + auto* scores_data = scores.data(); + auto* bboxes_data = bboxes.data(); + auto* odata = outs->data(); + + int count = 0; + for (const auto& it : selected_indices) { + int label = it.first; + const T* sdata = scores_data + label * predict_dim; + const std::vector& indices = it.second; + for (size_t j = 0; j < indices.size(); ++j) { + int idx = indices[j]; + const T* bdata = bboxes_data + idx * kBBoxSize; + odata[count * kOutputDim] = label; // label + odata[count * kOutputDim + 1] = sdata[idx]; // score + // xmin, ymin, xmax, ymax + std::memcpy(odata + count * kOutputDim + 2, bdata, 4 * sizeof(T)); + count++; + } + } + } + + void Compute(const framework::ExecutionContext& ctx) const override { + auto* boxes = ctx.Input("BBoxes"); + auto* scores = ctx.Input("Scores"); + auto* outs = ctx.Output("Out"); + + auto score_dims = scores->dims(); + + int64_t batch_size = score_dims[0]; + int64_t class_num = score_dims[1]; + int64_t predict_dim = score_dims[2]; + + std::vector>> all_indices; + std::vector batch_starts = {0}; + for (int64_t i = 0; i < batch_size; ++i) { + Tensor ins_score = scores->Slice(i, i + 1); + ins_score.Resize({class_num, predict_dim}); + std::map> indices; + int num_nmsed_out = 0; + MultiClassNMS(ctx, ins_score, *boxes, indices, num_nmsed_out); + all_indices.push_back(indices); + batch_starts.push_back(batch_starts.back() + num_nmsed_out); + } + + int num_kept = batch_starts.back(); + if (num_kept == 0) { + T* od = outs->mutable_data({1}, ctx.GetPlace()); + od[0] = -1; + } else { + outs->mutable_data({num_kept, kOutputDim}, ctx.GetPlace()); + for (int64_t i = 0; i < batch_size; ++i) { + Tensor ins_score = scores->Slice(i, i + 1); + ins_score.Resize({class_num, predict_dim}); + int64_t s = batch_starts[i]; + int64_t e = batch_starts[i + 1]; + if (e > s) { + Tensor out = outs->Slice(s, e); + MultiClassOutput(ins_score, *boxes, all_indices[i], &out); + } + } + } + + framework::LoD lod; + lod.emplace_back(batch_starts); + + outs->set_lod(lod); + } +}; + +class MultiClassNMSOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MultiClassNMSOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("BBoxes", + "(Tensor) A 2-D Tensor with shape [M, 4] represents the " + "predicted locations of M bounding bboxes. Each bounding box " + "has four coordinate values and the layout is " + "[xmin, ymin, xmax, ymax]."); + AddInput("Scores", + "(Tensor) A 3-D Tensor with shape [N, C, M] represents the " + "predicted confidence predictions. N is the batch size, C is the " + "class number, M is number of bounding boxes. For each category " + "there are total M scores which corresponding M bounding boxes. " + " Please note, M is equal to the 1st dimension of BBoxes. "); + AddAttr( + "background_label", + "(int64_t, defalut: 0) " + "The index of background label, the background label will be ignored. 
" + "If set to -1, then all categories will be considered.") + .SetDefault(0); + AddAttr("score_threshold", + "(float) " + "Threshold to filter out bounding boxes with low " + "confidence score. If not provided, consider all boxes."); + AddAttr("nms_top_k", + "(int64_t) " + "Maximum number of detections to be kept according to the " + "confidences aftern the filtering detections based on " + "score_threshold"); + AddAttr("nms_threshold", + "(float, defalut: 0.3) " + "The threshold to be used in NMS.") + .SetDefault(0.3); + AddAttr("nms_eta", + "(float) " + "The parameter for adaptive NMS.") + .SetDefault(1.0); + AddAttr("keep_top_k", + "(int64_t) " + "Number of total bboxes to be kept per image after NMS " + "step. -1 means keeping all bboxes after NMS step."); + AddOutput("Out", + "(LoDTensor) A 2-D LoDTensor with shape [No, 6] represents the " + "detections. Each row has 6 values: " + "[label, confidence, xmin, ymin, xmax, ymax], No is the total " + "number of detections in this mini-batch. For each instance, " + "the offsets in first dimension are called LoD, the number of " + "offset is N + 1, if LoD[i + 1] - LoD[i] == 0, means there is " + "no detected bbox."); + AddComment(R"DOC( +This operator is to do multi-class non maximum suppression (NMS) on a batched +of boxes and scores. + +In the NMS step, this operator greedily selects a subset of detection bounding +boxes that have high scores larger than score_threshold, if providing this +threshold, then selects the largest nms_top_k confidences scores if nms_top_k +is larger than -1. Then this operator pruns away boxes that have high IOU +(intersection over union) overlap with already selected boxes by adaptive +threshold NMS based on parameters of nms_threshold and nms_eta. + +Aftern NMS step, at most keep_top_k number of total bboxes are to be kept +per image if keep_top_k is larger than -1. + +This operator support multi-class and batched inputs. It applying NMS +independently for each class. The outputs is a 2-D LoDTenosr, for each +image, the offsets in first dimension of LoDTensor are called LoD, the number +of offset is N + 1, where N is the batch size. If LoD[i + 1] - LoD[i] == 0, +means there is no detected bbox for this image. If there is no detected boxes +for all images, all the elements in LoD are 0, and the Out only contains one +value which is -1. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(multiclass_nms, ops::MultiClassNMSOp, + ops::MultiClassNMSOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(multiclass_nms, ops::MultiClassNMSKernel, + ops::MultiClassNMSKernel); diff --git a/paddle/fluid/operators/multiplex_op.cc b/paddle/fluid/operators/multiplex_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f89b00376ba7a759419fa60efe80575b6a8d1f2e --- /dev/null +++ b/paddle/fluid/operators/multiplex_op.cc @@ -0,0 +1,131 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/multiplex_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +class MultiplexOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Ids"), "Input(Ids) shouldn't be null."); + PADDLE_ENFORCE(!ctx->Inputs("X").empty(), + "MultiInput(X) shouldn't be empty."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) shouldn't be null."); + auto ids_dim = ctx->GetInputDim("Ids"); + PADDLE_ENFORCE( + ids_dim.size() == 2 && ids_dim[1] == 1, + "The index tensor must be a vector with size batchSize x 1."); + + auto ins_dims = ctx->GetInputsDim("X"); + auto num_ins = ins_dims.size(); + PADDLE_ENFORCE(num_ins > 1, + "multiplex operator should have more than " + "one candidate input tensors."); + + auto in_dim = ins_dims[0]; + PADDLE_ENFORCE(in_dim.size() >= 2, + "The rank of candidate tensors must be not less than 2."); + for (size_t i = 1; i < num_ins; i++) { + auto dim = ins_dims[i]; + PADDLE_ENFORCE(in_dim == dim, + "All the candidate tensors must have the same size."); + } + ctx->SetOutputDim("Out", in_dim); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.MultiInput("X")[0]->type()), + ctx.device_context()); + } +}; + +class MultiplexOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MultiplexOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Ids", "The index tensor of multiplex operator."); + AddInput("X", "The candidate tensors of multiplex operator.") + .AsDuplicable(); + AddOutput("Out", "The output tensor of multiplex operator."); + AddComment(R"DOC( +Multiplex Operator. + +Multiplex multiple tensors according to the index provided by the index tensor. + +Ids: the index tensor. +X[0 : N - 1]: the candidate tensors for output (N >= 2). +For each index i from 0 to batchSize - 1, the output is the i-th row of the +the (Ids[i])-th tensor. + +For i-th row of the output tensor: + +$$y[i] = x_{k}[i]$$ + +where `y` is the output tensor, `x_{k}` is the k-th input tensor, +and `k = Ids[i]`. 
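+
+For example, with three candidate tensors X[0], X[1] and X[2] and
+Ids = [[2], [0], [1]], row 0 of the output is row 0 of X[2], row 1 is
+row 1 of X[0], and row 2 is row 2 of X[1].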
+ +)DOC"); + } +}; + +class MultiplexGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(!ctx->Inputs("X").empty(), "Input(X) should not be null."); + PADDLE_ENFORCE(!ctx->Outputs(framework::GradVarName("X")).empty(), + "Output(X@Grad) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null."); + ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.MultiInput("X")[0]->type()), + ctx.device_context()); + } +}; + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; + +REGISTER_OPERATOR(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(multiplex_grad, ops::MultiplexGradOp); +REGISTER_OP_CPU_KERNEL( + multiplex, + ops::MultiplexCPUKernel, + ops::MultiplexCPUKernel, + ops::MultiplexCPUKernel, + ops::MultiplexCPUKernel); +REGISTER_OP_CPU_KERNEL( + multiplex_grad, + ops::MultiplexGradCPUKernel, + ops::MultiplexGradCPUKernel, + ops::MultiplexGradCPUKernel, + ops::MultiplexGradCPUKernel); diff --git a/paddle/fluid/operators/multiplex_op.cu b/paddle/fluid/operators/multiplex_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..3ef7ef1dfcd04d59573bc6c726fef757f5f2ce23 --- /dev/null +++ b/paddle/fluid/operators/multiplex_op.cu @@ -0,0 +1,102 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/multiplex_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class MultiplexGPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto ins = ctx.MultiInput("X"); + auto* ids = ctx.Input("Ids"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + auto rows = ins[0]->dims()[0]; + auto cols = ins[0]->numel() / rows; + // copy index to cpu + Tensor index_t_cpu; + Copy(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu); + auto* index = index_t_cpu.data(); + auto stream = ctx.cuda_device_context().stream(); + platform::CUDAPlace place = boost::get(ctx.GetPlace()); + for (auto i = 0; i < rows; i++) { + int32_t k = index[i]; + PADDLE_ENFORCE_GE(k, 0, "index must be nonnegative."); + PADDLE_ENFORCE_LT((size_t)k, ins.size(), + "index exceeds the number of candidate tensors."); + memory::Copy(place, out->data() + i * cols, place, + ins[k]->data() + i * cols, cols * sizeof(T), stream); + } + } +}; + +template +class MultiplexGradGPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* d_out = ctx.Input(framework::GradVarName("Out")); + auto ins = ctx.MultiInput("X"); + auto* ids = ctx.Input("Ids"); + auto d_ins = ctx.MultiOutput(framework::GradVarName("X")); + for (size_t i = 0; i < d_ins.size(); i++) { + if (d_ins[i]) { + d_ins[i]->mutable_data(ctx.GetPlace()); + auto t = framework::EigenVector::Flatten(*d_ins[i]); + t.device(*ctx.template device_context().eigen_device()) = + t.constant(static_cast(0)); + } + } + + auto rows = ins[0]->dims()[0]; + auto cols = ins[0]->numel() / rows; + // copy index to cpu + Tensor index_t_cpu; + Copy(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu); + auto* index = index_t_cpu.data(); + + auto stream = ctx.cuda_device_context().stream(); + platform::CUDAPlace place = boost::get(ctx.GetPlace()); + for (auto i = 0; i < rows; i++) { + size_t k = static_cast(index[i]); + if (d_ins[k]) { + memory::Copy(place, d_ins[k]->data() + i * cols, place, + d_out->data() + i * cols, cols * sizeof(T), stream); + } + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + multiplex, + ops::MultiplexGPUKernel, + ops::MultiplexGPUKernel, + ops::MultiplexGPUKernel, + ops::MultiplexGPUKernel); +REGISTER_OP_CUDA_KERNEL( + multiplex_grad, + ops::MultiplexGradGPUKernel, + ops::MultiplexGradGPUKernel, + ops::MultiplexGradGPUKernel, + ops::MultiplexGradGPUKernel); diff --git a/paddle/fluid/operators/multiplex_op.h b/paddle/fluid/operators/multiplex_op.h new file mode 100644 index 0000000000000000000000000000000000000000..682117cb1b4581560d6b4a615d97e2d18a91ffd6 --- /dev/null +++ b/paddle/fluid/operators/multiplex_op.h @@ -0,0 +1,81 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memcpy.h" + +namespace paddle { +namespace operators { + +template +class MultiplexCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto ins = ctx.MultiInput("X"); + auto ids = ctx.Input("Ids"); + auto* out = ctx.Output("Out"); + + out->mutable_data(ctx.GetPlace()); + + auto rows = ins[0]->dims()[0]; + auto cols = ins[0]->numel() / rows; + auto index = ids->data(); + platform::CPUPlace place = boost::get(ctx.GetPlace()); + for (auto i = 0; i < rows; i++) { + int32_t k = index[i]; + PADDLE_ENFORCE_GE(k, 0, "index must be nonnegative."); + PADDLE_ENFORCE_LT(static_cast(k), ins.size(), + "index exceeds the number of candidate tensors."); + memory::Copy(place, out->data() + i * cols, place, + ins[k]->data() + i * cols, cols * sizeof(T)); + } + } +}; + +template +class MultiplexGradCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* d_out = ctx.Input(framework::GradVarName("Out")); + auto* ids = ctx.Input("Ids"); + auto ins = ctx.MultiInput("X"); + auto d_ins = + ctx.MultiOutput(framework::GradVarName("X")); + for (size_t i = 0; i < d_ins.size(); i++) { + if (d_ins[i]) { + d_ins[i]->mutable_data(ctx.GetPlace()); + auto t = framework::EigenVector::Flatten(*d_ins[i]); + t.device(*ctx.template device_context().eigen_device()) = + t.constant(static_cast(0)); + } + } + + auto rows = ins[0]->dims()[0]; + auto cols = ins[0]->numel() / rows; + auto* index = ids->data(); + platform::CPUPlace place = boost::get(ctx.GetPlace()); + for (auto i = 0; i < rows; i++) { + size_t k = static_cast(index[i]); + if (d_ins[k]) { + memory::Copy(place, d_ins[k]->data() + i * cols, place, + d_out->data() + i * cols, cols * sizeof(T)); + } + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/nccl/CMakeLists.txt b/paddle/fluid/operators/nccl/CMakeLists.txt similarity index 100% rename from paddle/operators/nccl/CMakeLists.txt rename to paddle/fluid/operators/nccl/CMakeLists.txt diff --git a/paddle/fluid/operators/nccl/nccl_gpu_common.cc b/paddle/fluid/operators/nccl/nccl_gpu_common.cc new file mode 100644 index 0000000000000000000000000000000000000000..2a8ce932ec51a3d85ef04a2c07e08186929632f2 --- /dev/null +++ b/paddle/fluid/operators/nccl/nccl_gpu_common.cc @@ -0,0 +1,20 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" +#include "paddle/fluid/platform/gpu_info.h" + +namespace paddle { +namespace platform {} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/operators/nccl/nccl_gpu_common.h b/paddle/fluid/operators/nccl/nccl_gpu_common.h new file mode 100644 index 0000000000000000000000000000000000000000..6e78613239e6c401bad5aa80746000c5b47cd031 --- /dev/null +++ b/paddle/fluid/operators/nccl/nccl_gpu_common.h @@ -0,0 +1,68 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/dynload/nccl.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace platform { + +constexpr int kInvalidGPUId = -1; + +struct Communicator { + std::vector comms_; + std::unordered_map comm_id_map_; + bool inited_; + + Communicator() {} + + int GetCommId(int device_id) const { return comm_id_map_.at(device_id); } + + void InitAll(const std::vector& gpus) { + comms_.resize(gpus.size()); + inited_ = false; + for (size_t i = 0; i < gpus.size(); ++i) { + comm_id_map_[gpus[i]] = i; + } + PADDLE_ENFORCE( + dynload::ncclCommInitAll(comms_.data(), gpus.size(), gpus.data())); + inited_ = true; + } + + ~Communicator() { + if (inited_) { + for (size_t i = 0; i < comms_.size(); ++i) { + // FIXME(dzh) : PADDLE_ENFORCE return void + dynload::ncclCommDestroy(comms_[i]); + } + } + } + + DISABLE_COPY_AND_ASSIGN(Communicator); +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/operators/nccl_op.cc b/paddle/fluid/operators/nccl_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..987b091e572e6dd3273568c73153cd5589e3da3d --- /dev/null +++ b/paddle/fluid/operators/nccl_op.cc @@ -0,0 +1,252 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" + +namespace paddle { +namespace operators { + +static constexpr char kParallelScopes[] = "parallel_scopes"; + +// NCCLinitOp +class NCCLInitOp : public framework::OperatorBase { + public: + NCCLInitOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kParallelScopes)), + "Can not find variable '%s' in the scope.", + kParallelScopes); + const auto &name = Output("Communicator"); + PADDLE_ENFORCE_NOT_NULL(scope.FindVar(name), + "Can not find variable '%s' in the scope.", name); + // A parallel do may not use all the gpus. For example, the batch size is 7 + // in the last batch while we have 8 gpu. In this case, parallel_do will + // create 7 parallel scopes, so should ncclInitOp create 7 gpu peers + auto ¶llel_scopes = scope.FindVar(Input(kParallelScopes)) + ->Get>(); + std::vector gpus(parallel_scopes.size()); + for (int i = 0; i < static_cast(parallel_scopes.size()); ++i) { + gpus[i] = i; + } + PADDLE_ENFORCE(!gpus.empty(), "NCCL init with 0 gpus."); + + if (scope.FindVar(name) == nullptr) { + PADDLE_THROW("Output(Communicator) is needed for ncclInit operator."); + } + + platform::Communicator *comm = + scope.FindVar(name)->GetMutable(); + comm->InitAll(gpus); + } +}; + +class NCCLInitOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + auto out_var_name = op_desc.Output("Communicator").front(); + auto &out_var = block->FindRecursiveOrCreateVar(out_var_name); + auto var_type = framework::proto::VarDesc::NCCL_COM; + out_var.SetType(var_type); + } +}; + +class NCCLInitOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override {} +}; + +class NCCLInitOpMaker : public framework::OpProtoAndCheckerMaker { + public: + NCCLInitOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput(kParallelScopes, "The working place of parallel do."); + AddOutput("Communicator", + "Create Communicator for communicating between gpus"); + AddComment(R"DOC( +NCCLInit Operator. + +Create communicator. 
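+
+It creates one NCCL communicator per GPU used by the enclosing parallel_do
+(one per parallel scope) and stores them in the Communicator variable, so
+that ncclAllReduce, ncclBcast and ncclReduce can reuse them.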
+ +)DOC"); + } +}; + +// AllReduceOp +class NCCLAllReduceOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + " Input(X) of AllReduce op input should not be NULL"); + PADDLE_ENFORCE( + ctx->HasInput("Communicator"), + " Input(Communicator) of AllReduce op input should not be NULL"); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + " Output(Out) of AllReduce op output should not be NULL"); + + auto x_dims = ctx->GetInputsDim("X"); + + std::string reduction = ctx->Attrs().Get("reduction"); + PADDLE_ENFORCE((reduction == "ncclSum" || reduction == "ncclProd" || + reduction == "ncclMin" || reduction == "ncclMax"), + "invalid reduction."); + + ctx->SetOutputsDim("Out", x_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +// ReduceOp +class NCCLReduceOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + " Input(X) of Reduce op input should not be NULL"); + PADDLE_ENFORCE( + ctx->HasInput("Communicator"), + " Input(Communicator) of Reduce op input should not be NULL"); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + " Input(X) of Reduce op input should not be NULL"); + + std::string reduction = ctx->Attrs().Get("reduction"); + PADDLE_ENFORCE((reduction == "ncclSum" || reduction == "ncclProd" || + reduction == "ncclMin" || reduction == "ncclMax"), + "invalid reduction."); + + auto x_dims = ctx->GetInputsDim("X"); + ctx->SetOutputsDim("Out", x_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +// BcastOp +class NCCLBcastOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + " Input(X) of Bcast op input should not be NULL"); + PADDLE_ENFORCE(ctx->HasInput("Communicator"), + " Input(Communicator) of Bcast op input should not be NULL"); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + " Output(Out) of Bcast op output should not be NULL"); + + int root = ctx->Attrs().Get("root"); + PADDLE_ENFORCE(root != platform::kInvalidGPUId, "Bcast root must be set."); + + auto x_dims = ctx->GetInputsDim("X"); + ctx->SetOutputsDim("Out", x_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +// AllreduceOp +class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker { + public: + NCCLAllReduceOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input of AllReduce op"); + AddInput("Communicator", "Communicator for communicating between gpus"); + AddOutput("Out", "The output of AllReduce op"); + AddAttr("reduction", + "(string, default 'ncclSum') " + "{'ncclMin', 'ncclMax', 'ncclProd', 'ncclSum'}.") + .SetDefault("ncclSum"); + AddComment(R"DOC( +NCCLAllReduce Operator. + +AllReduce the input tensors. 
+ +)DOC"); + } +}; + +// ReduceOp +class NCCLReduceOpMaker : public framework::OpProtoAndCheckerMaker { + public: + NCCLReduceOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input of Reduce op"); + AddInput("Communicator", "Communicator for communicating between gpus"); + AddOutput("Out", "The output of Reduce op"); + AddAttr("reduction", + "(string, default 'ncclSum') " + "{'ncclMin', 'ncclMax', 'ncclProd', 'ncclSum'}.") + .SetDefault("ncclSum"); + AddAttr("root", + "(int, default kInvalidGPUId) " + "Root gpu of the parameter. If not, " + "set(platform::kInvalidGPUId). Hashed by name.") + .SetDefault(platform::kInvalidGPUId); + AddComment(R"DOC( +NCCLReduce Operator. + +Reduce the tensors. + +)DOC"); + } +}; + +// BcastOp +class NCCLBcastOpMaker : public framework::OpProtoAndCheckerMaker { + public: + NCCLBcastOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input of BcastSend op"); + AddInput("Communicator", "Communicator for communicating between gpus"); + AddOutput("Out", "The output of Bcast"); + AddAttr("root", + "(int, default kInvalidGPUId) " + "Root gpu of the parameter. If not, " + "set(platform::kInvalidGPUId). Hashed by name.") + .SetDefault(platform::kInvalidGPUId); + AddComment(R"DOC( +NCCLBcast Operator. + +Bcast the tensors. + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(ncclInit, ops::NCCLInitOp, + paddle::framework::EmptyGradOpMaker, ops::NCCLInitOpMaker, + ops::NCCLInitOpVarTypeInference, + ops::NCCLInitOpShapeInference); + +REGISTER_OP_WITHOUT_GRADIENT(ncclAllReduce, ops::NCCLAllReduceOp, + ops::NCCLAllReduceOpMaker); +REGISTER_OP_WITHOUT_GRADIENT(ncclBcast, ops::NCCLBcastOp, + ops::NCCLBcastOpMaker); +REGISTER_OP_WITHOUT_GRADIENT(ncclReduce, ops::NCCLReduceOp, + ops::NCCLReduceOpMaker); diff --git a/paddle/fluid/operators/nccl_op.cu.cc b/paddle/fluid/operators/nccl_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..7637c7ed163f64502fcbd1dae12d373067ce5787 --- /dev/null +++ b/paddle/fluid/operators/nccl_op.cu.cc @@ -0,0 +1,217 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenseshashernless required by applicable law or agreed +to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; +using platform::Communicator; +using framework::LoDTensor; + +template +class NCCLTypeWrapper; + +template <> +class NCCLTypeWrapper { + public: + static const ncclDataType_t type = ncclFloat; +}; + +template <> +class NCCLTypeWrapper { + public: + static const ncclDataType_t type = ncclDouble; +}; + +template +class NCCLAllReduceKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "This kernel only runs on GPU device."); + + auto ins = ctx.MultiInput("X"); + auto outs = ctx.MultiOutput("Out"); + + LOG(INFO) << "------------------"; + std::string reduction = ctx.Attr("reduction"); + LOG(INFO) << "------------------"; + ncclRedOp_t reduction_op_ = ncclSum; + LOG(INFO) << "------------------"; + + if (reduction == "ncclMin") { + reduction_op_ = ncclMin; + } else if (reduction == "ncclMax") { + reduction_op_ = ncclMax; + } else if (reduction == "ncclSum") { + reduction_op_ = ncclSum; + } else if (reduction == "ncclProd") { + reduction_op_ = ncclProd; + } else { + PADDLE_THROW("Invalid reduction. default ncclSum."); + } + + LOG(INFO) << "------------------"; + auto* comm = ctx.Input("Communicator"); + + LOG(INFO) << "------------------"; + auto stream = ctx.cuda_device_context().stream(); + + LOG(INFO) << "------------------"; + // device id + int gpu_id = boost::get(ctx.GetPlace()).GetDeviceId(); + LOG(INFO) << "------------------"; + int idx = comm->GetCommId(gpu_id); + + LOG(INFO) << "------------------"; + for (size_t i = 0; i < ins.size(); ++i) { + VLOG(1) << "gpu : " + << " invoke allreduce. send " << ins[i]->numel() << " recv " + << outs[i]->numel(); + + PADDLE_ENFORCE(platform::dynload::ncclAllReduce( + ins[i]->data(), outs[i]->mutable_data(ctx.GetPlace()), + outs[i]->numel(), NCCLTypeWrapper::type, reduction_op_, + comm->comms_[idx], stream)); + PADDLE_ENFORCE(cudaStreamSynchronize(stream)); + + VLOG(1) << "gpu : " + << " finished allreduce. send " << ins[i]->numel() << " recv " + << outs[i]->numel(); + } + } +}; + +template +class NCCLReduceKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "This kernel only runs on GPU device."); + + auto ins = ctx.MultiInput("X"); // x0, x1, x2 + auto outs = ctx.MultiOutput("Out"); + + std::string reduction = ctx.Attr("reduction"); + ncclRedOp_t reduction_op_ = ncclSum; + + if (reduction == "ncclMin") { + reduction_op_ = ncclMin; + } else if (reduction == "ncclMax") { + reduction_op_ = ncclMax; + } else if (reduction == "ncclSum") { + reduction_op_ = ncclSum; + } else if (reduction == "ncclProd") { + reduction_op_ = ncclProd; + } else { + PADDLE_THROW("Invalid reduction. 
default ncclSum."); + } + + int root = ctx.Attr("root"); + auto* comm = ctx.Input("Communicator"); + + auto stream = reinterpret_cast( + ctx.device_context()) + .stream(); + // device id + int gpu_id = boost::get(ctx.GetPlace()).GetDeviceId(); + int idx = comm->GetCommId(gpu_id); + + auto ins_names = ctx.Inputs("X"); + std::hash hasher; + for (size_t i = 0; i < ins.size(); ++i) { + if (root == platform::kInvalidGPUId) { + root = hasher(ins_names[i]) % comm->comms_.size(); + } + T* recvbuffer = nullptr; + if (root == gpu_id) { + recvbuffer = outs[i]->mutable_data(ctx.GetPlace()); + } + + VLOG(1) << "gpu : " << gpu_id << " invoke reduce. send " + << ins[i]->numel() << " recv " << outs[i]->numel(); + + PADDLE_ENFORCE(platform::dynload::ncclReduce( + ins[i]->data(), recvbuffer, ins[i]->numel(), + NCCLTypeWrapper::type, reduction_op_, root, comm->comms_[idx], + stream)); + PADDLE_ENFORCE(cudaStreamSynchronize(stream)); + + VLOG(1) << "gpu : " << gpu_id << " finished reduce. send " + << ins[i]->numel() << " recv " << outs[i]->numel(); + } + } +}; + +template +class NCCLBcastKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "This kernel only runs on GPU device."); + + int root = ctx.Attr("root"); + + auto* comm = ctx.Input("Communicator"); + + auto stream = reinterpret_cast( + ctx.device_context()) + .stream(); + // device id + int gpu_id = boost::get(ctx.GetPlace()).GetDeviceId(); + int idx = comm->GetCommId(gpu_id); + + if (idx == root) { + auto ins = ctx.MultiInput("X"); + for (size_t i = 0; i < ins.size(); ++i) { + VLOG(1) << "gpu : " << gpu_id << " invoke Bcast. send " + << ins[i]->numel(); + + VLOG(1) << " before ncclBcast"; + PADDLE_ENFORCE(platform::dynload::ncclBcast( + (void*)ins[i]->data(), ins[i]->numel(), NCCLTypeWrapper::type, + root, comm->comms_[idx], stream)); + VLOG(1) << " after ncclBcast"; + PADDLE_ENFORCE(cudaStreamSynchronize(stream)); + + VLOG(1) << "gpu : " << gpu_id << " finished Bcast."; + } + } else { + auto outs = ctx.MultiOutput("Out"); + for (size_t i = 0; i < outs.size(); ++i) { + VLOG(1) << "gpu : " << gpu_id << " invoke Bcast. recv buffer " + << framework::product(outs[i]->dims()); + + PADDLE_ENFORCE(platform::dynload::ncclBcast( + outs[i]->mutable_data(ctx.GetPlace()), outs[i]->numel(), + NCCLTypeWrapper::type, root, comm->comms_[idx], stream)); + PADDLE_ENFORCE(cudaStreamSynchronize(stream)); + + VLOG(1) << "gpu : " << gpu_id << " finished Bcast. recv " + << outs[i]->numel(); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(ncclAllReduce, ops::NCCLAllReduceKernel); +REGISTER_OP_CUDA_KERNEL(ncclBcast, ops::NCCLBcastKernel); +REGISTER_OP_CUDA_KERNEL(ncclReduce, ops::NCCLReduceKernel); diff --git a/paddle/fluid/operators/nccl_op_test.cu.cc b/paddle/fluid/operators/nccl_op_test.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..212ed2f9b63de5061dc3a2eb86508d8a4c305f89 --- /dev/null +++ b/paddle/fluid/operators/nccl_op_test.cu.cc @@ -0,0 +1,318 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/init.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/var_desc.h" +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/place.h" + +USE_NO_KERNEL_OP(ncclInit); +USE_CUDA_ONLY_OP(ncclAllReduce); +USE_CUDA_ONLY_OP(ncclReduce); +USE_CUDA_ONLY_OP(ncclBcast); + +namespace f = paddle::framework; +namespace p = paddle::platform; + +static std::vector gpu_list; + +// test data amount +const f::DDim kDims = {100, 100}; + +// nccl op common tester, init communicator. +class NCCLTester : public ::testing::Test { + public: + virtual void SetUp() override { + paddle::platform::CPUPlace cpu_place; + for (size_t i = 0; i < gpu_list.size(); ++i) { + p::CUDAPlace place(i); + dev_ctxs.emplace_back(new p::CUDADeviceContext(place)); + } + + NCCLInitOp(); + } + + virtual void TearDown() override { + for (auto &device_context : dev_ctxs) { + delete device_context; + } + } + + void NCCLInitOp() { + paddle::platform::CPUPlace cpu_place; + std::unique_ptr op1(new f::OpDesc); + + op1->SetType("ncclInit"); + op1->SetOutput("Communicator", {"comm"}); + op1->SetAttr("gpus", {gpu_list}); + + auto *var = g_scope.Var("comm"); + var->GetMutable(); + + auto op = f::OpRegistry::CreateOp(*op1); + VLOG(1) << "invoke NCCLInitOp."; + op->Run(g_scope, cpu_place); + VLOG(1) << "NCCLInitOp finished."; + } + + template + void PerThreadProgram(int gpu_id, const f::OpDesc &op_desc, f::Scope *scope) { + std::unique_lock lk(mu); + const f::OpDesc *op1 = &op_desc; + + p::CUDAPlace place(gpu_id); + auto &ctx = dev_ctxs.at(gpu_id); + + auto *send_tensor = scope->Var("st")->GetMutable(); + auto *recv_tensor = scope->Var("rt")->GetMutable(); + + if (!send_tensor->numel()) { + send_tensor->Resize(kDims); + send_tensor->mutable_data(kDims, place); + + std::vector send_vector(f::product(kDims), gpu_id); + paddle::framework::CopyFromVector(send_vector, *ctx, send_tensor); + ctx->Wait(); + VLOG(1) << "Send Tensor filled with elements " << send_tensor->numel(); + } + + lk.unlock(); + + PADDLE_ENFORCE(send_tensor->numel() == f::product(kDims), + "Tensor numel not match!"); + + auto op = f::OpRegistry::CreateOp(*op1); + + VLOG(1) << "Device : " << gpu_id << " invoke " << op_desc.Type(); + VLOG(1) << " send_tensor : " << send_tensor->numel() + << " recv_tensor : " << recv_tensor->numel(); + op->Run(*scope, place); + VLOG(1) << "Device : " << gpu_id << " finished " << op_desc.Type(); + } + + public: + std::vector dev_ctxs; + f::Scope g_scope; + std::mutex mu; +}; + +// ncclInitOp with desc +TEST(NCCL, ncclInitOp) { + std::unique_ptr op_desc(new f::OpDesc); + + op_desc->SetType("ncclInit"); + op_desc->SetOutput("Communicator", {"x1"}); + op_desc->SetAttr("gpus", {gpu_list}); + + f::Scope 
g_scope; + paddle::platform::CPUPlace cpu_place; + + auto *var = g_scope.Var("x1"); + var->GetMutable(); + + auto op = f::OpRegistry::CreateOp(*op_desc); + VLOG(1) << "invoke NCCLInitOp."; + op->Run(g_scope, cpu_place); + VLOG(1) << "NCCLInitOp finished."; +} + +// ncclAllReduceOp with desc +TEST_F(NCCLTester, ncclAllReduceOp) { + std::unique_ptr op2(new f::OpDesc); + op2->SetType("ncclAllReduce"); + op2->SetInput("X", {"st"}); + op2->SetInput("Communicator", {"comm"}); + op2->SetOutput("Out", {"rt"}); + + std::vector dev_scopes; + + std::vector ths; + + for (size_t i = 0; i < gpu_list.size(); ++i) { + dev_scopes.emplace_back(&g_scope.NewScope()); + std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list[i], + *op2.get(), dev_scopes[i]); + ths.emplace_back(std::move(th)); + } + + for (size_t i = 0; i < gpu_list.size(); ++i) { + ths[i].join(); + } + + // check results + float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0); + + for (size_t i = 0; i < dev_scopes.size(); ++i) { + p::CPUPlace cpu_place; + p::CUDAPlace gpu_place(gpu_list[i]); + + auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get(); + auto *rt = recv_tensor.data(); + auto *result_tensor = dev_scopes[i]->Var("ct")->GetMutable(); + result_tensor->Resize(kDims); + auto *ct = result_tensor->mutable_data(cpu_place); + + paddle::memory::Copy( + cpu_place, ct, p::CUDAPlace(gpu_list[i]), rt, + recv_tensor.numel() * sizeof(float), + static_cast(dev_ctxs[i])->stream()); + + for (int64_t j = 0; j < f::product(kDims); ++j) { + ASSERT_NEAR(ct[j], result, 1e-5); + } + } +} + +// ncclReduceOp with desc +TEST_F(NCCLTester, ncclReduceOp) { + std::unique_ptr op2(new f::OpDesc); + const int kRoot = 0; + op2->SetType("ncclReduce"); + op2->SetInput("X", {"st"}); + op2->SetInput("Communicator", {"comm"}); + op2->SetOutput("Out", {"rt"}); + op2->SetAttr("root", kRoot); + + std::vector dev_scopes; + + std::vector ths; + + for (size_t i = 0; i < gpu_list.size(); ++i) { + dev_scopes.emplace_back(&g_scope.NewScope()); + std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list[i], + *op2.get(), dev_scopes[i]); + ths.emplace_back(std::move(th)); + } + + for (size_t i = 0; i < gpu_list.size(); ++i) { + ths[i].join(); + } + + // check results on + float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0); + + p::CPUPlace cpu_place; + p::CUDAPlace gpu_place(gpu_list[kRoot]); + + auto &recv_tensor = dev_scopes[kRoot]->FindVar("rt")->Get(); + auto *rt = recv_tensor.data(); + auto *result_tensor = + dev_scopes[kRoot]->Var("ct")->GetMutable(); + result_tensor->Resize(kDims); + auto *ct = result_tensor->mutable_data(cpu_place); + + paddle::memory::Copy( + cpu_place, ct, p::CUDAPlace(gpu_list[kRoot]), rt, + recv_tensor.numel() * sizeof(float), + static_cast(dev_ctxs[kRoot])->stream()); + + for (int64_t j = 0; j < f::product(kDims); ++j) { + ASSERT_NEAR(ct[j], result, 1e-5); + } +} + +// ncclBcastOp with desc +TEST_F(NCCLTester, ncclBcastOp) { + std::unique_ptr op2(new f::OpDesc); + const int kRoot = 0; + op2->SetType("ncclBcast"); + op2->SetInput("X", {"st"}); + op2->SetInput("Communicator", {"comm"}); + op2->SetOutput("Out", {"rt"}); + op2->SetAttr("root", kRoot); + + std::vector dev_scopes; + + std::vector ths; + + for (size_t i = 0; i < gpu_list.size(); ++i) { + dev_scopes.emplace_back(&g_scope.NewScope()); + std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list[i], + *op2.get(), dev_scopes[i]); + ths.emplace_back(std::move(th)); + } + + for (size_t i = 0; i < gpu_list.size(); ++i) { + ths[i].join(); + } + + const int 
idx = 1; + // check results on + float result = kRoot; + + p::CPUPlace cpu_place; + p::CUDAPlace gpu_place(gpu_list[idx]); + + auto &recv_tensor = dev_scopes[idx]->FindVar("rt")->Get(); + auto *rt = recv_tensor.data(); + auto *result_tensor = dev_scopes[idx]->Var("ct")->GetMutable(); + result_tensor->Resize(kDims); + auto *ct = result_tensor->mutable_data(cpu_place); + + paddle::memory::Copy( + cpu_place, ct, p::CUDAPlace(gpu_list[idx]), rt, + recv_tensor.numel() * sizeof(float), + static_cast(dev_ctxs[idx])->stream()); + + for (int64_t j = 0; j < f::product(kDims); ++j) { + ASSERT_NEAR(ct[j], result, 1e-5); + } +} + +int main(int argc, char **argv) { + // FIXME(tonyyang-svail): + // Due to the driver issue on our CI, disable for now + return 0; + const int dev_count = p::GetCUDADeviceCount(); + if (dev_count <= 1) { + LOG(WARNING) + << "Cannot test multi-gpu nccl, because the CUDA device count is " + << dev_count; + return 0; + } + + std::vector places; + + places.emplace_back(paddle::platform::CPUPlace()); + int count = paddle::platform::GetCUDADeviceCount(); + for (int i = 0; i < count; ++i) { + places.emplace_back(paddle::platform::CUDAPlace(i)); + gpu_list.emplace_back(i); + } + + VLOG(0) << " DeviceCount " << count; + paddle::platform::DeviceContextPool::Init(places); + + testing::InitGoogleTest(&argc, argv); + + // device context should be release before scope. + // otherwise driver will down. + return RUN_ALL_TESTS(); +} diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0841313a1042ea4099473fdc0293d9bae4a7c8c3 --- /dev/null +++ b/paddle/fluid/operators/nce_op.cc @@ -0,0 +1,187 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/nce_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class NCEOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input")); + PADDLE_ENFORCE(ctx->HasInput("Label")); + PADDLE_ENFORCE(ctx->HasInput("Weight")); + PADDLE_ENFORCE(ctx->HasOutput("Cost")); + PADDLE_ENFORCE(ctx->HasOutput("SampleLogits")); + PADDLE_ENFORCE(ctx->HasOutput("SampleLabels")); + + auto x_dims = ctx->GetInputDim("Input"); + auto label_dims = ctx->GetInputDim("Label"); + PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0]); + int num_true_classes = label_dims.size() == 2 ? 
label_dims[1] : 1; + if (ctx->HasInput("Bias")) { + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Weight")[0], + ctx->GetInputDim("Bias")[0]); + } + auto num_neg_samples = ctx->Attrs().Get("num_neg_samples"); + auto num_total_classes = ctx->Attrs().Get("num_total_classes"); + std::vector custom_neg_classes = + ctx->Attrs().Get>("custom_neg_classes"); + PADDLE_ENFORCE_EQ(num_total_classes, ctx->GetInputDim("Weight")[0]); + if (custom_neg_classes.size() > 0) { + PADDLE_ENFORCE_EQ(custom_neg_classes.size(), + static_cast(num_neg_samples)); + } + // set dims of output(Out) + std::vector out_dims; + out_dims.push_back(x_dims[0]); + out_dims.push_back(1); + ctx->SetOutputDim("Cost", framework::make_ddim(out_dims)); + + // set dims of output(SampleOut) + std::vector sample_out_dims; + sample_out_dims.push_back(x_dims[0]); + sample_out_dims.push_back(num_neg_samples + num_true_classes); + ctx->SetOutputDim("SampleLogits", framework::make_ddim(sample_out_dims)); + ctx->SetOutputDim("SampleLabels", framework::make_ddim(sample_out_dims)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), + ctx.GetPlace()); + } +}; + +class NCEOpMaker : public framework::OpProtoAndCheckerMaker { + public: + NCEOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Input", "(Tensor) A tensor of shape [batch_size, dim]."); + AddInput( + "Label", + "(Tensor) A tensor of shape [batch_size, num_true_class]. " + "'num_true_class' is the number of target classes in each sample." + "The number of target classes per sample should be same. " + "If you have a variable number of target classes, " + "you can pad them out to a constant number by either repeating them" + " or by padding with an otherwise unused class.)"); + AddInput("Weight", + "(Tensor) A tensor of shape [num_class, dim]. 'num_class' is the " + "total number of class."); + AddInput( + "Bias", + "(Tensor) A tensor of shape [num_class, 1]. 'num_class' is the total " + "number of class. It is a dispensable input.") + .AsDispensable(); + AddInput("SampleWeight", + "(Tensor) A tensor of shape [batch_size, 1] storing a weight for " + "each sample. And it is a dispensable input. The default value of " + "sample is 1.") + .AsDispensable(); + AddOutput("Cost", + "(Tensor) A tensor of shape [batch_size, 1]. Cost of samples."); + AddOutput("SampleLogits", + "An intermediate tensor of shape[batch_size, num_neg_samples + " + "num_pos_samples]." + "This tensor is output of forward kernel and used in backward " + "kernel to compute grads." + "Given X is the dot product of input tensor and sampled labels' " + "weights." + "Then 'SampleLogits' is sigmoid(X).") + .AsIntermediate(); + AddOutput("SampleLabels", + "An intermediate tensor of shape[batch_size, num_neg_samples + " + "num_pos_samples]." + "This tensor is output of forward kernel and used in backward " + "kernel to compute grads." + "") + .AsIntermediate(); + AddAttr("num_total_classes", + "Total number of classes in all samples."); + AddAttr("num_neg_samples", + "The number of negative classes. The default value is 10.") + .SetDefault(10); + AddAttr>("custom_neg_classes", + "This attribute only be used in unitest. Classes " + "in this list wiil be used as negative classes " + "for every samples. 
Under normal conditions, " + "user should avoid setting this attribute.") + .SetDefault({}); + AddComment(R"DOC( +Compute and return the noise-contrastive estimation training loss. +See [Noise-contrastive estimation: A new estimation principle for unnormalized statistical models](http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf). +By default this operator uses a uniform distribution for sampling. +)DOC"); + } +}; + +class NCEOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input")); + PADDLE_ENFORCE(ctx->HasInput("Weight")); + PADDLE_ENFORCE(ctx->HasInput("Cost")); + PADDLE_ENFORCE(ctx->HasInput("SampleLogits")); + PADDLE_ENFORCE(ctx->HasInput("SampleLabels")); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Cost")), + "The input(Out@GRAD) should not be null."); + + auto x_dims = ctx->GetInputDim("Input"); + auto x_grad_name = framework::GradVarName("Input"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + + auto w_dims = ctx->GetInputDim("Weight"); + auto w_grad_name = framework::GradVarName("Weight"); + if (ctx->HasOutput(w_grad_name)) { + ctx->SetOutputDim(w_grad_name, w_dims); + } + + auto bias_grad_name = framework::GradVarName("Bias"); + if (ctx->HasOutput(bias_grad_name)) { + auto bias_dims = ctx->GetInputDim("Bias"); + ctx->SetOutputDim(bias_grad_name, bias_dims); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), + ctx.GetPlace()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(nce, ops::NCEOp, ops::NCEOpMaker, nce_grad, ops::NCEOpGrad); +REGISTER_OP_CPU_KERNEL(nce, ops::NCEKernel, + ops::NCEKernel); +REGISTER_OP_CPU_KERNEL(nce_grad, + ops::NCEGradKernel, + ops::NCEGradKernel); diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h new file mode 100644 index 0000000000000000000000000000000000000000..624c2d9bbd3245a11c8cfff2dd7cae6e5b25f106 --- /dev/null +++ b/paddle/fluid/operators/nce_op.h @@ -0,0 +1,212 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "unsupported/Eigen/CXX11/Tensor" +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +using EigenMatrix = framework::EigenMatrix; + +template +void PrepareSamples(const framework::ExecutionContext& context) { + auto label = context.Input("Label"); + const int64_t* label_data = label->data(); + auto label_dims = label->dims(); + int num_total_classes = context.Attr("num_total_classes"); + // for unitest + std::vector custom_neg_classes = + context.Attr>("custom_neg_classes"); + // random machine + std::random_device rd; + std::mt19937 rng(rd()); + std::uniform_int_distribution rand(0, num_total_classes - 1); + + auto sample_labels = context.Output("SampleLabels"); + auto sample_labels_dims = sample_labels->dims(); + int64_t* sample_labels_data = + sample_labels->mutable_data(context.GetPlace()); + + int num_label = label_dims.size() == 2 ? label_dims[1] : 1; + int index = 0; + for (int64_t i = 0; i < label_dims[0]; ++i) { + int j = 0; + for (; j < num_label; ++j) { + sample_labels_data[index++] = label_data[i * num_label + j]; + } + if (custom_neg_classes.size() > 0) { + for (auto label : custom_neg_classes) { + sample_labels_data[index++] = label; + } + } else { + for (; j < sample_labels_dims[1]; ++j) { + // TODO(wanghaoshuang): support more distribution sampling + sample_labels_data[index++] = rand(rng); + } + } + } +} + +template +class NCEKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + PrepareSamples(context); + auto sample_labels = context.Output("SampleLabels"); + const int64_t* sample_labels_data = sample_labels->data(); + auto sample_out = context.Output("SampleLogits"); + T* sample_out_data = sample_out->mutable_data(context.GetPlace()); + auto label = context.Input("Label"); + auto sample_weight = context.Input("SampleWeight"); + const T* sample_weight_data = nullptr; + if (sample_weight != nullptr) { + sample_weight_data = sample_weight->data(); + } + auto out = context.Output("Cost"); + T* out_data = out->mutable_data(context.GetPlace()); + int num_neg_samples = context.Attr("num_neg_samples"); + int num_total_classes = context.Attr("num_total_classes"); + int64_t num_true_class = 1; + if (label != nullptr) { + num_true_class = label->dims()[1]; + } + T b = 1. / num_total_classes * num_neg_samples; + // forward bias + auto bias = context.Input("Bias"); + if (bias != nullptr) { + const T* bias_data = bias->data(); + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + sample_out_data[i] = bias_data[sample_labels_data[i]]; + } + } else { + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + sample_out_data[i] = 0; + } + } + // forward mul + auto input_mat = EigenMatrix::From(*(context.Input("Input"))); + auto weight_mat = EigenMatrix::From(*(context.Input("Weight"))); + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + Eigen::Tensor result = + (input_mat.chip((int)(i / sample_labels->dims()[1]), 0) * + weight_mat.chip(sample_labels_data[i], 0)) + .sum(); + sample_out_data[i] += result(0); + sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i]))); + } + // forward cost + for (int64_t i = 0; i < sample_labels->dims()[0]; ++i) { + int64_t j = 0; + out_data[i] = 0; + T w = sample_weight == nullptr ? 1. 
: sample_weight_data[i]; + // for true classes + for (; j < num_true_class; ++j) { + T o = sample_out_data[i * sample_out->dims()[1] + j]; + T cost = -log(o / (o + b)); + out_data[i] += w * cost; + } + // for sampled neg classes + for (; j < sample_labels->dims()[1]; ++j) { + T o = sample_out_data[i * sample_out->dims()[1] + j]; + T cost = -log(b / (o + b)); + out_data[i] += w * cost; + } + } + } +}; + +template +class NCEGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto d_out = context.Input(framework::GradVarName("Cost")); + const T* d_out_data = d_out->data(); + auto label = context.Input("Label"); + auto sample_out = context.Input("SampleLogits"); + const T* sample_out_data = sample_out->data(); + auto sample_labels = context.Input("SampleLabels"); + const int64_t* sample_labels_data = sample_labels->data(); + auto sample_weight = context.Input("SampleWeight"); + const T* sample_weight_data = nullptr; + if (sample_weight != nullptr) { + sample_weight_data = sample_weight->data(); + } + int num_neg_samples = context.Attr("num_neg_samples"); + int num_total_classes = context.Attr("num_total_classes"); + int num_true_class = 1; + if (label != nullptr) { + num_true_class = label->dims()[1]; + } + T b = 1. / num_total_classes * num_neg_samples; + Tensor sample_grad; // tmp tensor + T* sample_grad_data = + sample_grad.mutable_data(sample_labels->dims(), context.GetPlace()); + // backward cost + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + T o = sample_out_data[i]; + T w = sample_weight == nullptr + ? 1 + : sample_weight_data[i / sample_labels->dims()[1]]; + sample_grad_data[i] = (i % sample_labels->dims()[1]) < num_true_class + ? w * (b / (o + b)) * (o - 1) + : w * (o * (1 - o) / (o + b)); + sample_grad_data[i] *= d_out_data[i / sample_labels->dims()[1]]; + } + // get d_bias + auto d_bias = context.Output(framework::GradVarName("Bias")); + if (d_bias != nullptr) { + T* d_bias_data = d_bias->mutable_data(context.GetPlace()); + std::fill(d_bias_data, d_bias_data + d_bias->numel(), 0.0); + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + d_bias_data[sample_labels_data[i]] += sample_grad_data[i]; + } + } + // get d_w + auto d_w = context.Output(framework::GradVarName("Weight")); + if (d_w != nullptr) { + auto d_w_data = d_w->mutable_data(context.GetPlace()); + std::fill(d_w_data, d_w_data + d_w->numel(), 0.0); + auto d_w_matrix = EigenMatrix::From(*d_w); + auto x_matrix = EigenMatrix::From(*(context.Input("Input"))); + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + d_w_matrix.chip(sample_labels_data[i], 0) += + x_matrix.chip((int)(i / sample_labels->dims()[1]), 0) * + sample_grad_data[i]; + } + } + // get d_x + auto d_x = context.Output(framework::GradVarName("Input")); + if (d_x != nullptr) { + auto* d_x_data = d_x->mutable_data(context.GetPlace()); + std::fill(d_x_data, d_x_data + d_x->numel(), 0.0); + auto d_x_matrix = EigenMatrix::From(*d_x); + auto w_matrix = EigenMatrix::From(*(context.Input("Weight"))); + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + d_x_matrix.chip((int)(i / sample_labels->dims()[1]), 0) += + w_matrix.chip(sample_labels_data[i], 0) * sample_grad_data[i]; + } + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/net_op.cc b/paddle/fluid/operators/net_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c0ca5873adcc92a39f20a162796e7581ea10c63f --- /dev/null +++ 
b/paddle/fluid/operators/net_op.cc @@ -0,0 +1,103 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/net_op.h" +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +const char NetOp::kAll[] = "all"; + +void NetOp::CompleteAddOp(bool calc) { + add_op_done_ = true; + if (!calc) return; + std::set input_set; + std::set output_set; + for (auto& op : ops_) { + for (auto& ipt : op->Inputs()) { + for (auto& var_name : ipt.second) { + // If input variable has been in output set, then it will be + // added into intermediate_outputs_. Otherwise, it will be + // added into input set. + if (Contains(output_set, var_name)) { + intermediate_outputs_.insert(var_name); + } else { + input_set.insert(var_name); + } + } + } + + for (auto& opt : op->Outputs()) { + for (auto& var_name : opt.second) { + output_set.insert(var_name); + } + } + } + auto& inputs = inputs_[kAll]; + inputs.reserve(input_set.size()); + std::copy(input_set.begin(), input_set.end(), std::back_inserter(inputs)); + auto& outputs = outputs_[kAll]; + outputs.reserve(output_set.size()); + std::copy(output_set.begin(), output_set.end(), std::back_inserter(outputs)); +} + +std::string NetOp::DebugStringEx(const framework::Scope* scope) const { + std::ostringstream os; + os << OperatorBase::DebugStringEx(scope) << std::endl; + for (auto& op : ops_) { + std::istringstream is(op->DebugStringEx(scope)); + for (std::string line; std::getline(is, line);) { + os << " " << line << std::endl; + } + } + return os.str(); +} + +bool NetOp::IsNetOp() const { return true; } + +std::vector NetOp::OutputVars(bool has_intermediate) const { + std::vector all; + for (auto& pair : this->outputs_) { + for (auto& var_name : pair.second) { + all.push_back(var_name); + } + } + if (has_intermediate) { + return all; + } + std::vector ret_val; + for (auto& each : all) { + if (!Contains(intermediate_outputs_, each)) { + ret_val.push_back(each); + } + } + return ret_val; +} + +NetOp::NetOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + +std::unique_ptr NetOp::Clone() const { + PADDLE_ENFORCE( + add_op_done_, + "Must clone a sealed NetOp, invoke Net::CompleteAddOp before clone"); + return std::unique_ptr(new NetOp(*this)); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/net_op.h b/paddle/fluid/operators/net_op.h new file mode 100644 index 0000000000000000000000000000000000000000..14e5909851c4ac08b5f59c5c193c801827b91234 --- /dev/null +++ b/paddle/fluid/operators/net_op.h @@ -0,0 +1,130 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include <set> +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +/** + * @brief Network is also a type of Operator + * + * It will manage the operators it has. + * + * Network is the container and controller of a set of operators. + + * A network object knows all Operators belonging to this network. Variables, + * which are inputs and outputs of these operators, are created and managed by a + * hierarchy of Scope objects. + * + * This is the base class of a network; all networks should implement the APIs + * it defines. + */ +class NetOp : public framework::OperatorBase { + public: + static const char kAll[]; + NetOp() + : framework::OperatorBase("plain_net", framework::VariableNameMap{}, + framework::VariableNameMap{}, + framework::AttributeMap{}) {} + + NetOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs); + + NetOp(const NetOp& o) : framework::OperatorBase(o.type_, {}, {}, o.attrs_) { + this->ops_.reserve(o.ops_.size()); + std::transform( + o.ops_.begin(), o.ops_.end(), std::back_inserter(this->ops_), + [](const std::unique_ptr<framework::OperatorBase>& op) { + return std::unique_ptr<framework::OperatorBase>(op->Clone()); + }); + this->CompleteAddOp(); + } + + /** + * @brief Run the network. + * + * Run all the operators with the `scope`; if no scope is provided, the default + * scope will be used instead. If no OpContext is provided, the default context + * will be used. 
+ */ + void Run(const framework::Scope& scope, + const platform::Place& place) const override { + for (auto& op : ops_) { + op->Run(scope, place); + } + } + + bool SupportGPU() const override { + for (auto& op : ops_) { + if (!op->SupportGPU()) { + return false; + } + } + return true; + } + + void AppendOp(const framework::OperatorBase& op) { AppendOp(op.Clone()); } + + /** + * @brief Add an operator by ptr + */ + void AppendOp(std::unique_ptr<framework::OperatorBase> op) { + PADDLE_ENFORCE(!add_op_done_, + "Cannot AppendOp when this network is sealed"); + PADDLE_ENFORCE_NOT_NULL(op, "Cannot Insert Null op"); + ops_.push_back(std::move(op)); + } + + void InsertOp(size_t pos, std::unique_ptr<framework::OperatorBase> op) { + PADDLE_ENFORCE(!add_op_done_, + "Cannot InsertOp when this network is sealed"); + PADDLE_ENFORCE_NOT_NULL(op, "Cannot Insert Null op"); + PADDLE_ENFORCE_LE(pos, ops_.size(), "Out of range"); + ops_.insert(ops_.begin() + pos, std::move(op)); + } + + void InsertOp(size_t pos, const framework::OperatorBase& op) { + InsertOp(pos, op.Clone()); + } + + void CompleteAddOp(bool calculate = true); + + std::string DebugStringEx( + const framework::Scope* scope = nullptr) const override; + + bool IsNetOp() const override; + std::vector<std::string> OutputVars(bool has_intermediate) const override; + + std::unique_ptr<framework::OperatorBase> Clone() const override; + + std::vector<std::unique_ptr<framework::OperatorBase>> ops_; + + private: + bool add_op_done_{false}; + std::set<std::string> intermediate_outputs_; + + template <typename T, typename KeyType> + static bool Contains(T container, KeyType key) { + return container.find(key) != container.end(); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/net_op_test.cc b/paddle/fluid/operators/net_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..cc20be0c81763abe2adcf09de858ce51e16d77a6 --- /dev/null +++ b/paddle/fluid/operators/net_op_test.cc @@ -0,0 +1,100 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
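+
+// A minimal NetOp usage sketch (illustrative only; `op1` and `op2` stand for
+// any std::unique_ptr<framework::OperatorBase> created elsewhere, e.g. via
+// framework::OpRegistry::CreateOp):
+//
+//   NetOp net;
+//   net.AppendOp(std::move(op1));
+//   net.AppendOp(std::move(op2));
+//   net.CompleteAddOp();    // seal the net and infer its overall inputs/outputs
+//   net.Run(scope, place);  // run the appended operators in order
+//
+// The tests below exercise the same API with simple NOP/TestOp operators.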
+#include "paddle/fluid/operators/net_op.h" + +#include + +namespace paddle { +namespace operators { +using Scope = framework::Scope; +using DeviceContext = platform::DeviceContext; + +static int run_cnt = 0; + +class TestOp : public framework::OperatorBase { + public: + using framework::OperatorBase::OperatorBase; + DEFINE_OP_CLONE_METHOD(TestOp); + void Run(const Scope& scope, const platform::Place& place) const override { + ++run_cnt; + } +}; + +template +void AssertSameVectorWithoutOrder(const std::vector& expected, + const std::vector& actual) { + ASSERT_EQ(expected.size(), actual.size()); + std::unordered_set expected_set; + for (auto& tmp : expected) { + expected_set.insert(tmp); + } + for (auto& act : actual) { + ASSERT_NE(expected_set.end(), expected_set.find(act)); + } +} + +TEST(OpKernel, all) { + auto net = std::make_shared(); + ASSERT_NE(net, nullptr); + + net->AppendOp(std::unique_ptr( + new TestOp("test", {{"X", {"x"}}, {"W", {"w1"}}, {"b", {"b1"}}}, + {{"Out", {"y"}}}, framework::AttributeMap{}))); + net->AppendOp(std::unique_ptr( + new TestOp("test", {{"X", {"y"}}, {"W", {"w2"}}, {"b", {"b2"}}}, + {{"Out", {"z"}}}, framework::AttributeMap{}))); + + net->CompleteAddOp(); + AssertSameVectorWithoutOrder({"x", "w1", "b1", "w2", "b2"}, + net->Inputs(NetOp::kAll)); + AssertSameVectorWithoutOrder({"y", "z"}, net->Outputs(NetOp::kAll)); + + auto final_outs = net->OutputVars(false); + + ASSERT_EQ(final_outs.size(), 1UL); + ASSERT_EQ(final_outs[0], "z"); +} + +TEST(NetOp, insert_op) { + NetOp net; + auto op1 = std::unique_ptr( + new framework::NOP("empty", {{"X", {"x"}}, {"W", {"w1"}}, {"b", {"b1"}}}, + {{"Out", {"y"}}}, framework::AttributeMap{})); + net.AppendOp(*op1); + net.InsertOp(0, *op1); + ASSERT_EQ(2UL, net.ops_.size()); + net.InsertOp(2, std::move(op1)); + ASSERT_EQ(3UL, net.ops_.size()); +} + +TEST(NetOp, Clone) { + NetOp net; + net.AppendOp(std::unique_ptr(new framework::NOP{ + "empty", framework::VariableNameMap{}, framework::VariableNameMap{}, + framework::AttributeMap{}})); + net.AppendOp(std::unique_ptr(new framework::NOP{ + "empty2", framework::VariableNameMap{}, framework::VariableNameMap{}, + framework::AttributeMap{}})); + net.CompleteAddOp(true); + auto new_net_op = net.Clone(); + ASSERT_NE(new_net_op, nullptr); + ASSERT_TRUE(new_net_op->IsNetOp()); + auto* new_net = static_cast(new_net_op.get()); + ASSERT_EQ(2UL, new_net->ops_.size()); + ASSERT_EQ(new_net->ops_[0]->Type(), "empty"); + ASSERT_EQ(new_net->ops_[1]->Type(), "empty2"); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/norm_op.cc b/paddle/fluid/operators/norm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..ee85b1a90a85f8c6ec57900c4c7d0dd319a0186a --- /dev/null +++ b/paddle/fluid/operators/norm_op.cc @@ -0,0 +1,95 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/norm_op.h" +namespace paddle { +namespace operators { + +template +class NormOpMaker : public framework::OpProtoAndCheckerMaker { + public: + NormOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "X", + "(Tensor) The input tensor of norm operator. " + "The format of input tensor is NCHW. Where N is batch size, C is the " + "number of channels, H and W is the height and width of feature."); + AddInput("Scale", + "(Tensor) The input tensor of norm operator. " + "The format of input tensor is C * 1."); + AddAttr("epsilon", + "(float, default 1e-10) Constant " + "for numerical stability.") + .SetDefault(1.0e-10f); + AddOutput("Out", + "(Tensor) The output tensor of norm operator." + "N * M." + "M = C * H * W"); + AddComment(R"DOC( + "Input shape: $(N, C, H, W)$ + Scale shape: $(C, 1)$ + Output shape: $(N, C, H, W)$ + Where + forward + $$ + [\frac {x_{1}}{\sqrt{\sum{x_{i}^{2}}}} \frac {x_{2}}{\sqrt{\sum{x_{i}^{2}}}} \frac {x_{3}}{\sqrt{\sum{x_{i}^{2}}}} \cdot \cdot \cdot \frac {x_{n}}{\sqrt{\sum{x_{i}^{2}}}}] + $$ + backward + $$ + \frac{\frac{\mathrm{d}L }{\mathrm{d}y_{1}} - \frac {x_{1}\sum {\frac{\mathrm{d} L}{\mathrm{d} y_{j}}}x_{j}}{\sum x_{j}^{2}} }{\sqrt{\sum{x_{j}^{2}}}} + $$ + )DOC"); + } +}; + +class NormOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of NormOp" + "should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Scale"), + "Input(Scale) of NormOp" + "should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of NormOp should not be null."); + auto in_x_dims = ctx->GetInputDim("X"); + ctx->SetOutputDim("Out", in_x_dims); + } +}; + +class NormOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Input(X@GRAD) should not be null."); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(norm, ops::NormOp, ops::NormOpMaker, norm_grad, + ops::NormOpGrad); +REGISTER_OP_CPU_KERNEL( + norm, ops::NormKernel, + ops::NormKernel); +REGISTER_OP_CPU_KERNEL( + norm_grad, ops::NormGradKernel, + ops::NormGradKernel); diff --git a/paddle/fluid/operators/norm_op.cu b/paddle/fluid/operators/norm_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..438bb3b86e79c526f70a23d3c7f6cc13f72e0463 --- /dev/null +++ b/paddle/fluid/operators/norm_op.cu @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#define EIGEN_USE_GPU + +#include "paddle/fluid/operators/norm_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + norm, ops::NormKernel, + ops::NormKernel); +REGISTER_OP_CUDA_KERNEL( + norm_grad, ops::NormGradKernel, + ops::NormGradKernel); diff --git a/paddle/fluid/operators/norm_op.h b/paddle/fluid/operators/norm_op.h new file mode 100644 index 0000000000000000000000000000000000000000..db74c9b02a74afc6bd0e59da97e64a3e556b97dc --- /dev/null +++ b/paddle/fluid/operators/norm_op.h @@ -0,0 +1,175 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +class NormKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor* in_x = context.Input("X"); + const framework::Tensor* scale = context.Input("Scale"); + auto* out = context.Output("Out"); + auto epsilon = static_cast(context.Attr("epsilon")); + out->mutable_data(context.GetPlace()); + int batch_size = in_x->dims()[0]; + int channels = in_x->dims()[1]; + int height = in_x->dims()[2]; + int width = in_x->dims()[3]; + int fea_len = height * width; + auto* place = + context.template device_context().eigen_device(); + auto x = + framework::EigenMatrix::From( + *in_x, framework::make_ddim({batch_size, fea_len * channels})); + // get square + framework::Tensor x_square; + x_square.mutable_data(in_x->dims(), context.GetPlace()); + auto x_square_eigen = + framework::EigenMatrix::From( + x_square, framework::make_ddim({batch_size, fea_len * channels})); + x_square_eigen.device(*place) = x.square(); + auto scale_eigen = + framework::EigenVector::Flatten( + *scale); + for (int n = 0; n < batch_size; ++n) { + framework::Tensor in_x_batch = in_x->Slice(n, n + 1); + auto in_x_batch_eigen = + framework::EigenMatrix::From( + in_x_batch, framework::make_ddim({channels, fea_len})); + framework::Tensor x_square_batch = x_square.Slice(n, n + 1); + auto x_square_batch_eigen = + framework::EigenMatrix::From( + x_square_batch, framework::make_ddim({channels, fea_len})); + framework::Tensor out_batch = out->Slice(n, n + 1); + auto out_batch_eigen = + framework::EigenMatrix::From( + out_batch, framework::make_ddim({channels, fea_len})); + framework::Tensor tmp_tensor; + tmp_tensor.mutable_data(framework::make_ddim({1, fea_len}), + context.GetPlace()); + auto tmp = framework::EigenVector::Flatten(tmp_tensor); + // get colsum and sqrt , inverse + auto dim = Eigen::array({{0}}); + tmp.device(*place) = x_square_batch_eigen.sum(dim); + tmp.device(*place) = (tmp + epsilon).sqrt().inverse(); + Eigen::array broadcast_dim_col; + broadcast_dim_col[1] = 1; + broadcast_dim_col[0] = channels; + out_batch_eigen.device(*place) = + in_x_batch_eigen * (tmp.broadcast(broadcast_dim_col)); + Eigen::array broadcast_dim_row; + broadcast_dim_row[1] = 
fea_len; + broadcast_dim_row[0] = 1; + out_batch_eigen.device(*place) = + out_batch_eigen * (scale_eigen.broadcast(broadcast_dim_row)); + } + } +}; +template +class NormGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor* in_x = context.Input("X"); + const framework::Tensor* scale = context.Input("Scale"); + const framework::Tensor* out_grad = + context.Input(framework::GradVarName("Out")); + auto epsilon = static_cast(context.Attr("epsilon")); + framework::Tensor* in_x_grad = + context.Output(framework::GradVarName("X")); + in_x_grad->mutable_data(context.GetPlace()); + int batch_size = in_x->dims()[0]; + int channels = in_x->dims()[1]; + int height = in_x->dims()[2]; + int width = in_x->dims()[3]; + int fea_len = height * width; + auto* place = + context.template device_context().eigen_device(); + + auto scale_eigen = + framework::EigenVector::Flatten( + *scale); + auto x = + framework::EigenMatrix::From( + *in_x, framework::make_ddim({batch_size, fea_len * channels})); + // get square + framework::Tensor x_square; + x_square.mutable_data(in_x->dims(), context.GetPlace()); + auto x_square_eigen = + framework::EigenMatrix::From( + x_square, framework::make_ddim({batch_size, fea_len * channels})); + x_square_eigen.device(*place) = x.square(); + + for (int n = 0; n < batch_size; ++n) { + framework::Tensor in_x_batch = in_x->Slice(n, n + 1); + auto in_x_batch_eigen = + framework::EigenMatrix::From( + in_x_batch, framework::make_ddim({channels, fea_len})); + framework::Tensor in_g_batch = in_x_grad->Slice(n, n + 1); + auto in_g_batch_eigen = + framework::EigenMatrix::From( + in_g_batch, framework::make_ddim({channels, fea_len})); + framework::Tensor x_square_batch = x_square.Slice(n, n + 1); + auto x_square_batch_eigen = + framework::EigenMatrix::From( + x_square_batch, framework::make_ddim({channels, fea_len})); + framework::Tensor outg_batch = out_grad->Slice(n, n + 1); + auto outg_batch_eigen = + framework::EigenMatrix::From( + outg_batch, framework::make_ddim({channels, fea_len})); + + framework::Tensor tmp_tensor; + tmp_tensor.mutable_data(framework::make_ddim({1, fea_len}), + context.GetPlace()); + auto tmp_eigen = + framework::EigenVector::Flatten(tmp_tensor); + auto dim = Eigen::array({{0}}); + tmp_eigen.device(*place) = (in_x_batch_eigen * outg_batch_eigen).sum(dim); + framework::Tensor norm_tmp_tensor; + norm_tmp_tensor.mutable_data(framework::make_ddim({1, fea_len}), + context.GetPlace()); + auto norm_tmp_eigen = + framework::EigenVector::Flatten(norm_tmp_tensor); + norm_tmp_eigen.device(*place) = + (x_square_batch_eigen.sum(dim) + epsilon).sqrt(); + Eigen::array broadcast_dim_col; + broadcast_dim_col[1] = 1; + broadcast_dim_col[0] = channels; + in_g_batch_eigen.device(*place) = + in_x_batch_eigen * tmp_eigen.broadcast(broadcast_dim_col); + in_g_batch_eigen.device(*place) = + in_g_batch_eigen / + (norm_tmp_eigen * norm_tmp_eigen).broadcast(broadcast_dim_col); + in_g_batch_eigen.device(*place) = outg_batch_eigen - in_g_batch_eigen; + // outg_batch_eigen + (in_g_batch_eigen * -1); + in_g_batch_eigen.device(*place) = + in_g_batch_eigen / norm_tmp_eigen.broadcast(broadcast_dim_col); + Eigen::array broadcast_dim_row; + broadcast_dim_row[1] = fea_len; + broadcast_dim_row[0] = 1; + in_g_batch_eigen.device(*place) = + in_g_batch_eigen * (scale_eigen.broadcast(broadcast_dim_row)); + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/one_hot_op.cc 
b/paddle/fluid/operators/one_hot_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..2c3a60da729d6f4edb8e7e3aa0c81cd3140855c0 --- /dev/null +++ b/paddle/fluid/operators/one_hot_op.cc @@ -0,0 +1,95 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/one_hot_op.h" +#include "paddle/fluid/framework/framework.pb.h" + +namespace paddle { +namespace operators { + +class OneHotOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of OneHotOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of OneHotOp should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_GE(x_dims.size(), 2, + "Rank of Input(X) should be at least 2."); + PADDLE_ENFORCE_GE(x_dims[x_dims.size() - 1], 1U, + "Last dimension of Input(X) should be 1."); + + int depth = ctx->Attrs().Get("depth"); + + PADDLE_ENFORCE_GT(depth, 0, "Should provide a positive depth (%d).", depth); + + framework::DDim out_dims(x_dims); + out_dims[out_dims.size() - 1] = depth; + ctx->SetOutputDim("Out", out_dims); + ctx->ShareLoD("X", /* --> */ "Out"); + } +}; + +class OneHotOpMaker : public framework::OpProtoAndCheckerMaker { + public: + OneHotOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(LoDTensor, LoDTensor) Input variable with rank at least 2. " + "The last dimension of X should be 1. Each value of X is an index " + "to indicate the position."); + AddOutput("Out", + "(Tensor, Tensor) Output tensor with same rank as X. " + "The tensor consists of one-hot representations of values in X."); + AddAttr("depth", + "A positive integer to specify the length of one-hot vector."); + AddAttr("dtype", + "An integer to specify the data type of one-hot " + "vector. The default value is FP32.") + .SetDefault(paddle::framework::proto::DataType::FP32); + AddComment(R"DOC( +One Hot Operator. This operator creates the one-hot representations for input +index values. 
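+Each input value must be an integer index in the interval [0, depth); the
+corresponding output row is all zeros except for a single 1.0 at that index.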
The following example will help to explain the function of this +operator: + +X is a LoDTensor: + X.lod = [[0, 1, 4]] + X.shape = [4, 1] + X.data = [[1], [1], [3], [0]] + +set depth = 4 + +Out is a LoDTensor: + Out.lod = [[0, 1, 4]] + Out.shape = [4, 4] + Out.data = [[0., 1., 0., 0.], + [0., 1., 0., 0.], + [0., 0., 0., 1.], + [1., 0., 0., 0.]] +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(one_hot, ops::OneHotOp, ops::OneHotOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL( + one_hot, ops::OneHotKernel, + ops::OneHotKernel); diff --git a/paddle/fluid/operators/one_hot_op.cu b/paddle/fluid/operators/one_hot_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..6a8061edaab61661f57b17cd5e065c5c84edb906 --- /dev/null +++ b/paddle/fluid/operators/one_hot_op.cu @@ -0,0 +1,80 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/one_hot_op.h" +#include "paddle/fluid/platform/cuda_helper.h" +#include "paddle/fluid/platform/gpu_info.h" + +namespace paddle { +namespace operators { +using platform::PADDLE_CUDA_NUM_THREADS; + +template +__global__ void FillOutputKernel(const InT* p_in_data, OutT* p_out_data, + const int64_t numel, const int depth) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < numel) { + *(p_out_data + (idx * depth) + p_in_data[idx]) = 1.0; + } +} + +template +struct OneHotOpCUDAFunctor { + const framework::LoDTensor* in_; + framework::LoDTensor* out_; + const DeviceContext& ctx_; + int depth_; + + OneHotOpCUDAFunctor(const framework::LoDTensor* in, framework::LoDTensor* out, + int depth, const DeviceContext& ctx) + : in_(in), out_(out), depth_(depth), ctx_(ctx) {} + + template + void operator()() const { + auto* p_in_data = in_->data(); + auto numel = in_->numel(); + auto* p_out_data = out_->mutable_data(ctx_.GetPlace()); + auto stream = ctx_.stream(); + math::set_constant(ctx_, out_, 0.0); + + FillOutputKernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / + PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>( + p_in_data, p_out_data, numel, depth_); + } +}; + +using LoDTensor = framework::LoDTensor; +template +class OneHotCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + auto* out = context.Output("Out"); + int depth = context.Attr("depth"); + + framework::VisitDataType( + static_cast(context.Attr("dtype")), + OneHotOpCUDAFunctor( + in, out, depth, context.template device_context())); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + one_hot, ops::OneHotCUDAKernel, + ops::OneHotCUDAKernel); diff --git a/paddle/fluid/operators/one_hot_op.h b/paddle/fluid/operators/one_hot_op.h new file mode 100644 index 
0000000000000000000000000000000000000000..ddac6edd0ec73a932dcf9c6ca7d7d63853467f1c --- /dev/null +++ b/paddle/fluid/operators/one_hot_op.h @@ -0,0 +1,68 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +struct OneHotOpFunctor { + const framework::LoDTensor* in_; + framework::LoDTensor* out_; + int depth_; + const DeviceContext& ctx_; + + OneHotOpFunctor(const framework::LoDTensor* in, framework::LoDTensor* out, + int depth, const DeviceContext& ctx) + : in_(in), out_(out), depth_(depth), ctx_(ctx) {} + + template + void operator()() const { + auto* p_in_data = in_->data(); + auto numel = in_->numel(); + auto* p_out_data = out_->mutable_data(ctx_.GetPlace()); + math::set_constant(ctx_, out_, 0.0); + + for (int i = 0; i < numel; ++i) { + PADDLE_ENFORCE_GE(p_in_data[i], 0, + "Illegal index value, should be at least 0."); + PADDLE_ENFORCE_LT(p_in_data[i], depth_, + "Illegal index value, should be less than depth (%d).", + depth_); + *(p_out_data + i * depth_ + p_in_data[i]) = 1.0; + } + } +}; + +using LoDTensor = framework::LoDTensor; +template +class OneHotKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + auto* out = context.Output("Out"); + int depth = context.Attr("depth"); + + framework::VisitDataType( + static_cast(context.Attr("dtype")), + OneHotOpFunctor( + in, out, depth, context.template device_context())); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/op_documentation/batch_norm_op.md b/paddle/fluid/operators/op_documentation/batch_norm_op.md similarity index 100% rename from paddle/operators/op_documentation/batch_norm_op.md rename to paddle/fluid/operators/op_documentation/batch_norm_op.md diff --git a/paddle/operators/op_documentation/name_convention.md b/paddle/fluid/operators/op_documentation/name_convention.md similarity index 100% rename from paddle/operators/op_documentation/name_convention.md rename to paddle/fluid/operators/op_documentation/name_convention.md diff --git a/paddle/operators/op_documentation/net_op_design.md b/paddle/fluid/operators/op_documentation/net_op_design.md similarity index 100% rename from paddle/operators/op_documentation/net_op_design.md rename to paddle/fluid/operators/op_documentation/net_op_design.md diff --git a/paddle/operators/op_documentation/op_markdown_format.md b/paddle/fluid/operators/op_documentation/op_markdown_format.md similarity index 100% rename from paddle/operators/op_documentation/op_markdown_format.md rename to paddle/fluid/operators/op_documentation/op_markdown_format.md diff --git a/paddle/operators/op_documentation/rnn_design.md b/paddle/fluid/operators/op_documentation/rnn_design.md similarity index 100% rename from 
paddle/operators/op_documentation/rnn_design.md rename to paddle/fluid/operators/op_documentation/rnn_design.md diff --git a/paddle/fluid/operators/pad_op.cc b/paddle/fluid/operators/pad_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..4b021fde7cba699327e1874d569a14c3139a4c32 --- /dev/null +++ b/paddle/fluid/operators/pad_op.cc @@ -0,0 +1,140 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/pad_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class PadOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of PadOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of PadOp should not be null."); + + auto x_dim = ctx->GetInputDim("X"); + auto paddings = ctx->Attrs().Get>("paddings"); + PADDLE_ENFORCE_EQ(x_dim.size() * 2, int64_t(paddings.size()), + "Size of paddings should be equal to 2 * dimension size " + "of input tensor."); + std::vector out_dims(x_dim.size()); + for (int i = 0; i < x_dim.size(); ++i) { + out_dims[i] = x_dim[i] + paddings[i * 2] + paddings[i * 2 + 1]; + } + ctx->SetOutputDim("Out", framework::make_ddim(out_dims)); + if (out_dims[0] == x_dim[0]) { + // Only pass LoD when the first dimension is equal between + // output and input. + ctx->ShareLoD("X", /*->*/ "Out"); + } + } +}; + +class PadOpMaker : public framework::OpProtoAndCheckerMaker { + public: + PadOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "The input of pad op. " + "The input should be a k-D tensor(k > 0 and k < 7)"); + AddOutput("Out", + "The output of pad op. " + "A tensor with the same shape as X."); + AddAttr>( + "paddings", + "(vector) " + "A list to describe the padding rules for each dimension. " + "For 2-D image tensor, paddings=[0, 1, 2, 3] means " + "padding 0 row to top, 1 row to bottom, 2 columns to left " + "and 3 columns to right. Size of paddings should be equal to " + "2 * dimension size of the input tensor."); + AddAttr("pad_value", + "(float, default 0.0) " + "The value to fill the padded areas.") + .SetDefault(0.0f); + AddComment(R"DOC( +Pad Operator. + +Pad input into output, as specified by paddings and pad_value. +The input should be a k-D tensor(k > 0 and k < 7). 
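+For a k-D input, the paddings attribute must hold 2*k integers: paddings[2*i] and
+paddings[2*i+1] give the number of pad_value entries added before and after the
+i-th dimension, respectively.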
As an example: + +Given: + +X = [[1, 2], + [3, 4]], + +paddings = [0, 1, 1, 2], + +and + +pad_value = 0, + +we have: + +Out = [[0, 1, 2, 0, 0] + [0, 3, 4, 0, 0] + [0, 0, 0, 0, 0]] + +)DOC"); + } +}; + +class PadOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + auto x_dims = ctx->GetInputDim("X"); + auto x_grad_name = framework::GradVarName("X"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + } +}; + +class PadOpGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* bind = new framework::OpDesc(); + bind->SetInput("X", Input("X")); + bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + bind->SetOutput(framework::GradVarName("X"), InputGrad("X")); + bind->SetAttrMap(Attrs()); + bind->SetType("pad_grad"); + return std::unique_ptr(bind); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(pad, ops::PadOp, ops::PadOpMaker, ops::PadOpGradMaker); +REGISTER_OPERATOR(pad_grad, ops::PadOpGrad); +REGISTER_OP_CPU_KERNEL( + pad, ops::PadKernel); +REGISTER_OP_CPU_KERNEL( + pad_grad, ops::PadGradKernel); diff --git a/paddle/fluid/operators/pad_op.cu b/paddle/fluid/operators/pad_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..203c31440371440b5e942452dab08978e2136275 --- /dev/null +++ b/paddle/fluid/operators/pad_op.cu @@ -0,0 +1,22 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/pad_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + pad, ops::PadKernel); +REGISTER_OP_CUDA_KERNEL( + pad_grad, ops::PadGradKernel); diff --git a/paddle/fluid/operators/pad_op.h b/paddle/fluid/operators/pad_op.h new file mode 100644 index 0000000000000000000000000000000000000000..244d8f9b6cf51ab249e991ed129e99da67ff9e62 --- /dev/null +++ b/paddle/fluid/operators/pad_op.h @@ -0,0 +1,134 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +using EigenTensor = framework::EigenTensor; + +template +void PadFunction(const framework::ExecutionContext& context) { + auto pads = context.Attr>("paddings"); + Eigen::array, D> paddings; + for (size_t i = 0; i < paddings.size(); ++i) { + paddings[i].first = pads[i * 2]; + paddings[i].second = pads[i * 2 + 1]; + } + T pad_value = context.Attr("pad_value"); + + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + out->mutable_data(context.GetPlace()); + + auto x_tensor = EigenTensor::From(*x); + auto out_tensor = EigenTensor::From(*out); + auto& place = + *context.template device_context().eigen_device(); + out_tensor.device(place) = x_tensor.pad(paddings, pad_value); +} + +template +class PadKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + int rank = context.Input("X")->dims().size(); + switch (rank) { + case 1: + PadFunction(context); + break; + case 2: + PadFunction(context); + break; + case 3: + PadFunction(context); + break; + case 4: + PadFunction(context); + break; + case 5: + PadFunction(context); + break; + case 6: + PadFunction(context); + break; + default: + PADDLE_THROW( + "PadOp only support tensors with no more than 6 dimensions."); + } + } +}; + +template +void PadGradFunction(const framework::ExecutionContext& context) { + auto pads = context.Attr>("paddings"); + Eigen::array, D> paddings; + for (size_t i = 0; i < paddings.size(); ++i) { + paddings[i].first = -pads[i * 2]; + paddings[i].second = -pads[i * 2 + 1]; + } + auto* d_out = context.Input(framework::GradVarName("Out")); + auto* d_x = context.Output(framework::GradVarName("X")); + if (d_x != nullptr) { + d_x->mutable_data(context.GetPlace()); + auto d_x_tensor = EigenTensor::From(*d_x); + auto d_out_tensor = EigenTensor::From(*d_out); + auto& place = + *context.template device_context().eigen_device(); + d_x_tensor.device(place) = d_out_tensor.pad(paddings, 0); + } +} + +template +class PadGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + size_t rank = + context.Input(framework::GradVarName("Out"))->dims().size(); + switch (rank) { + case 1: + PadGradFunction(context); + break; + case 2: + PadGradFunction(context); + break; + case 3: + PadGradFunction(context); + break; + case 4: + PadGradFunction(context); + break; + case 5: + PadGradFunction(context); + break; + case 6: + PadGradFunction(context); + break; + default: + PADDLE_THROW( + "PadOp only support tensors with no more than 6 dimensions."); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/parallel_do_op.cc b/paddle/fluid/operators/parallel_do_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..ff5730bfe77a909ae59e18361c6157831c7eff23 --- /dev/null +++ b/paddle/fluid/operators/parallel_do_op.cc @@ -0,0 +1,398 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/threadpool.h" +#include "paddle/fluid/operators/detail/safe_ref.h" + +namespace paddle { +namespace operators { + +static constexpr char kInputs[] = "inputs"; +static constexpr char kParameters[] = "parameters"; +static constexpr char kPlaces[] = "places"; + +static constexpr char kOutputs[] = "outputs"; +static constexpr char kParallelScopes[] = "parallel_scopes"; + +static constexpr char kParallelBlock[] = "sub_block"; +static constexpr char kUseNCCL[] = "use_nccl"; + +using LoDTensor = framework::LoDTensor; +using SelectedRows = framework::SelectedRows; + +static void SplitTensorAndMoveTensorToScopes( + const framework::Scope &scope, std::vector *sub_scopes, + const std::vector &places, + const std::vector &names) { + size_t num_sub_scopes = 0; + for (auto &argu : names) { + const auto &tensor = + detail::Ref(scope.FindVar(argu), + "Cannot find variable %s in the parent scope", argu) + .Get(); + auto lod_tensors = tensor.SplitLoDTensor(places); + + for (auto &lod : lod_tensors) { + VLOG(3) << lod.dims(); + } + if (num_sub_scopes == 0) { + num_sub_scopes = lod_tensors.size(); + } else { + PADDLE_ENFORCE_EQ(num_sub_scopes, lod_tensors.size()); + } + PADDLE_ENFORCE_NE(num_sub_scopes, 0); + if (sub_scopes->size() == 0) { + sub_scopes->reserve(num_sub_scopes); + for (size_t i = 0; i < num_sub_scopes; ++i) { + sub_scopes->emplace_back(&scope.NewScope()); + } + } + + for (size_t i = 0; i < lod_tensors.size(); ++i) { + *detail::Ref(sub_scopes->at(i)->Var(argu), + "Cannot find variable in the sub-scope", argu) + .GetMutable() = lod_tensors[i]; + } + } +} + +inline void CopyOrShare(const framework::Variable &src, + const platform::Place &dst_place, + framework::Variable *dst) { + if (src.IsType()) { + if (src.Get().place() == dst_place) { + dst->GetMutable()->ShareDataWith(src.Get()); + dst->GetMutable()->set_lod(src.Get().lod()); + } else { + Copy(src.Get(), dst_place, dst->GetMutable()); + } + } else if (src.IsType()) { + auto &src_sr = src.Get(); + auto *dst_sr = dst->GetMutable(); + dst_sr->set_height(src_sr.height()); + if (src_sr.value().place() == dst_place) { + dst_sr->mutable_value()->ShareDataWith(src_sr.value()); + dst_sr->set_rows(src_sr.rows()); + } else { + Copy(src_sr.value(), dst_place, dst_sr->mutable_value()); + } + } else { + PADDLE_THROW("Expect LoDTensor/SelectedRows, get %s", src.Type().name()); + } +} + +void WaitOnPlace(const platform::Place place) { + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + dev_ctx.Wait(); +} + +void WaitOnPlaces(const std::vector places) { + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + + for (auto &place : places) { + auto &dev_ctx = *pool.Get(place); + dev_ctx.Wait(); + } +} + +class ParallelDoOp : public framework::OperatorBase { + public: + ParallelDoOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : 
framework::OperatorBase(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + auto *block = Attr(kParallelBlock); + auto *program = block->Program(); + + auto &places = scope.FindVar(Input(kPlaces))->Get(); + + auto &sub_scopes = *scope.FindVar(Output(kParallelScopes)) + ->GetMutable>(); + + // split input + SplitTensorAndMoveTensorToScopes(scope, &sub_scopes, places, + Inputs(kInputs)); + + // copy parameter + for (auto ¶m : Inputs(kParameters)) { + PADDLE_ENFORCE(scope.FindVar(param)->IsType(), + "Only support parameter type as LoDTensor"); + auto &src = scope.FindVar(param)->Get(); + for (size_t i = 0; i < sub_scopes.size(); ++i) { + auto &place = places[i]; + auto *sub_scope = sub_scopes[i]; + auto *dst = sub_scope->Var(param)->GetMutable(); + framework::Copy(src, place, dst); + } + } + WaitOnPlaces(places); + + // PADDLE_ENFORCE_EQ(places.size(), sub_scopes.size()); + std::vector> workers; + workers.reserve(places.size()); + for (size_t place_idx = 0; place_idx < sub_scopes.size(); ++place_idx) { + auto &place = places[place_idx]; + auto *cur_scope = sub_scopes[place_idx]; + + workers.emplace_back(framework::Async([program, cur_scope, place, block] { + framework::Executor executor(place); + executor.Run(*program, cur_scope, block->ID(), + false /*create_local_scope*/); + })); + } + for (auto &worker : workers) { + worker.wait(); + } + WaitOnPlaces(places); + + // merge output + for (auto &o_name : Outputs(kOutputs)) { + std::vector lod_tensors; + lod_tensors.reserve(sub_scopes.size()); + for (auto *sub_scope : sub_scopes) { + lod_tensors.emplace_back(&sub_scope->FindVar(o_name)->Get()); + } + + auto *lod_tensor_to_be_merged = + scope.FindVar(o_name)->GetMutable(); + lod_tensor_to_be_merged->MergeLoDTensor(lod_tensors, dev_ctx.GetPlace()); + } + WaitOnPlaces(places); + } +}; + +class ParallelDoOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + ParallelDoOpProtoMaker(OpProto *proto, framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput(kInputs, "").AsDuplicable(); + AddInput(kParameters, "").AsDuplicable(); + AddInput(kPlaces, ""); + AddOutput(kOutputs, "").AsDuplicable(); + AddOutput(kParallelScopes, ""); + AddAttr(kParallelBlock, ""); + AddAttr(kUseNCCL, "true if we use nccl on backward") + .SetDefault(false); + AddComment(R"DOC( +ParallelDo Operator. 
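+ +This operator splits the LoD tensors given in Input(inputs) across the devices listed in Input(places), copies Input(parameters) to every device, runs the attached sub_block on each device in its own sub-scope with a separate executor, and then merges the per-device results back into Output(outputs).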
+)DOC"); + } +}; + +class ParallelDoGradOp : public framework::OperatorBase { + public: + ParallelDoGradOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto *block = Attr(kParallelBlock); + auto *program = block->Program(); + + auto &sub_scopes = scope.FindVar(Input(kParallelScopes)) + ->Get>(); + auto &places = scope.FindVar(Input(kPlaces))->Get(); + // PADDLE_ENFORCE_EQ(places.size(), sub_scopes.size()); + + // feed output@grad + SplitTensorAndMoveTensorToScopes( + scope, const_cast *>(&sub_scopes), + places, Inputs(framework::GradVarName(kOutputs))); + WaitOnPlaces(places); + LOG(INFO) << "places " << places.size(); + + // exe run + std::vector> workers; + for (size_t i = 0; i < sub_scopes.size(); ++i) { + auto &place = places[i]; + auto *cur_scope = sub_scopes[i]; + LOG(INFO) << place; + + // execute + workers.emplace_back(framework::Async([program, cur_scope, place, block] { + framework::Executor executor(place); + executor.Run(*program, cur_scope, block->ID(), + false /*create_local_scope*/); + })); + } + LOG(INFO) << "places " << places.size(); + for (auto &worker : workers) { + worker.wait(); + } + WaitOnPlaces(places); + + // NCCL allreduce op will be added by backward, + // so no need to explicitly accumulate grad + if (!(Attr(kUseNCCL))) { + AccumulateGrad(scope, place, sub_scopes, places); + } else { + for (auto &place : places) { + PADDLE_ENFORCE(platform::is_gpu_place(place), + "NCCL only supports cuda place"); + } + } + for (auto &s : Outputs(framework::GradVarName(kParameters))) { + CopyOrShare(*sub_scopes[0]->FindVar(s), place, scope.FindVar(s)); + } + WaitOnPlaces(places); + } + + void AccumulateGrad(const framework::Scope &scope, + const platform::Place &place, + const std::vector &sub_scopes, + const platform::PlaceList &places) const { + for (auto &s : Outputs(framework::GradVarName(kParameters))) { + VLOG(3) << "Accumulating " << s; + if (s == framework::kEmptyVarName) continue; + std::string tmp_name; + auto *tmp = sub_scopes[0]->Var(&tmp_name); + + for (size_t i = 1; i < sub_scopes.size(); ++i) { + CopyOrShare(*sub_scopes[i]->FindVar(s), places[0], tmp); + WaitOnPlaces(places); + + auto sum_op = framework::OpRegistry::CreateOp( + "sum", {{"X", {s, tmp_name}}}, {{"Out", {s}}}, + framework::AttributeMap{}); + VLOG(10) << sum_op->DebugStringEx(sub_scopes[0]); + sum_op->Run(*sub_scopes[0], places[0]); + WaitOnPlace(places[0]); + } + + CopyOrShare(*sub_scopes[0]->FindVar(s), place, scope.FindVar(s)); + } + WaitOnPlaces(places); + } +}; + +std::ostream &operator<<(std::ostream &sout, + const std::vector &strs) { + std::copy(strs.begin(), strs.end(), + std::ostream_iterator(sout, ",")); + return sout; +} + +class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + virtual std::unique_ptr Apply() const { + auto *grad = new framework::OpDesc(); + grad->SetType("parallel_do_grad"); + for (auto &input_param : this->InputNames()) { + VLOG(3) << input_param; + grad->SetInput(input_param, this->Input(input_param)); + if (input_param != kPlaces) { + grad->SetOutput(framework::GradVarName(input_param), + this->InputGrad(input_param, false)); + } + } + auto *g_block = this->grad_block_[0]; + + // All variable name 
that needed by gradient operators + std::unordered_set all_inputs_in_grad_blocks; + + for (size_t i = 0; i < g_block->OpSize(); ++i) { + auto *op = g_block->Op(i); + for (auto &var_name : op->InputArgumentNames()) { + all_inputs_in_grad_blocks.insert(var_name); + } + } + + for (auto &output_param : this->OutputNames()) { + if (output_param == kParallelScopes) { + grad->SetInput(output_param, this->Output(output_param)); + grad->SetInput(framework::GradVarName(output_param), + this->Output(output_param)); + } else { + grad->SetInput(output_param, this->Output(output_param)); + std::vector og_names; + for (auto &og_name : this->OutputGrad(output_param)) { + if (all_inputs_in_grad_blocks.count(og_name) != 0) { + // there are some gradient operators who need the OG. So make this + // OG as an input of parallel.do + og_names.push_back(og_name); + } + // else, there is no operator who need the OG. Do not use this OG as + // an input + } + grad->SetInput(framework::GradVarName(output_param), og_names); + } + } + grad->SetAttrMap(this->Attrs()); + grad->SetBlockAttr(kParallelBlock, *grad_block_[0]); + + return std::unique_ptr(grad); + } +}; + +class ParallelDoGradOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInputs(kParameters)); + PADDLE_ENFORCE(ctx->HasInputs(kInputs)); + PADDLE_ENFORCE(ctx->HasInputs(kOutputs)); + + ctx->SetOutputsDim(framework::GradVarName(kParameters), + ctx->GetInputsDim(kParameters)); + + auto i_dims = ctx->GetInputsDim(kInputs); + auto ig_names = ctx->Outputs(framework::GradVarName(kInputs)); + + for (size_t i = 0; i < ig_names.size(); ++i) { + auto &ig_name = ig_names[i]; + if (ig_name == framework::kEmptyVarName) { + continue; + } + + ctx->SetDims({ig_name}, {i_dims[i]}); + } + + auto p_dims = ctx->GetInputsDim(kParameters); + auto pg_names = ctx->Outputs(framework::GradVarName(kParameters)); + for (size_t i = 0; i < pg_names.size(); ++i) { + auto &pg_name = pg_names[i]; + if (pg_name == framework::kEmptyVarName) { + continue; + } + ctx->SetDims({pg_name}, {p_dims[i]}); + } + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(parallel_do, paddle::operators::ParallelDoOp, + paddle::operators::ParallelDoOpProtoMaker, + paddle::operators::ParallelDoGradOpDescMaker); +REGISTER_OPERATOR(parallel_do_grad, paddle::operators::ParallelDoGradOp, + paddle::operators::ParallelDoGradOpShapeInference); diff --git a/paddle/fluid/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..75984b7721c48526c6d11c4b82004dba1c166cc4 --- /dev/null +++ b/paddle/fluid/operators/pool_cudnn_op.cu.cc @@ -0,0 +1,178 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/pool_op.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; +using ScopedPoolingDescriptor = platform::ScopedPoolingDescriptor; +using DataLayout = platform::DataLayout; +using PoolingMode = platform::PoolingMode; + +template +class PoolCUDNNOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + + const Tensor *input = ctx.Input("X"); + Tensor *output = ctx.Output("Out"); + + const T *input_data = input->data(); + T *output_data = output->mutable_data(ctx.GetPlace()); + + std::string pooling_type = ctx.Attr("pooling_type"); + std::vector ksize = ctx.Attr>("ksize"); + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + if (ctx.Attr("global_pooling")) { + for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; + ksize[i] = static_cast(input->dims()[i + 2]); + } + } + + // ------------------- cudnn descriptors --------------------- + ScopedTensorDescriptor input_desc; + ScopedTensorDescriptor output_desc; + ScopedPoolingDescriptor pool_desc; + DataLayout layout; + + if (strides.size() == 2U) { + layout = DataLayout::kNCHW; + } else { + layout = DataLayout::kNCDHW; + } + + cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + layout, framework::vectorize2int(input->dims())); + cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( + layout, framework::vectorize2int(output->dims())); + + PoolingMode pooling_mode; + if (pooling_type == "max") { + pooling_mode = PoolingMode::kMaximum; + } else { + pooling_mode = PoolingMode::kAverage; + } + + cudnnPoolingDescriptor_t cudnn_pool_desc = + pool_desc.descriptor(pooling_mode, ksize, paddings, strides); + + // ------------------- cudnn pool algorithm --------------------- + auto handle = ctx.cuda_device_context().cudnn_handle(); + T alpha = 1.0f, beta = 0.0f; + + PADDLE_ENFORCE(platform::dynload::cudnnPoolingForward( + handle, cudnn_pool_desc, &alpha, cudnn_input_desc, input_data, &beta, + cudnn_output_desc, output_data)); + } +}; + +template +class PoolCUDNNGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + + const Tensor *input = ctx.Input("X"); + const Tensor *output = ctx.Input("Out"); + const Tensor *output_grad = + ctx.Input(framework::GradVarName("Out")); + Tensor *input_grad = ctx.Output(framework::GradVarName("X")); + + std::string pooling_type = ctx.Attr("pooling_type"); + std::vector ksize = ctx.Attr>("ksize"); + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + + if (ctx.Attr("global_pooling")) { + for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; + ksize[i] = static_cast(input->dims()[i + 2]); + } + } + + const T *input_data = input->data(); + const T *output_data = output->data(); + const T *output_grad_data = output_grad->data(); + + // ------------------- cudnn descriptors --------------------- + ScopedTensorDescriptor input_desc; + ScopedTensorDescriptor output_desc; + ScopedPoolingDescriptor pool_desc; + DataLayout layout; + + if (strides.size() == 2U) { + 
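+ // Two stride values mean 2-D pooling, which uses the NCHW layout; three values mean 3-D pooling, handled below with NCDHW.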
layout = DataLayout::kNCHW; + } else { + layout = DataLayout::kNCDHW; + } + + cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + layout, framework::vectorize2int(input->dims())); + cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( + layout, framework::vectorize2int(output->dims())); + + PoolingMode pooling_mode; + if (pooling_type == "max") { + pooling_mode = PoolingMode::kMaximum; + } else { + pooling_mode = PoolingMode::kAverage; + } + + cudnnPoolingDescriptor_t cudnn_pool_desc = + pool_desc.descriptor(pooling_mode, ksize, paddings, strides); + + // ------------------- cudnn pool algorithm --------------------- + auto handle = ctx.cuda_device_context().cudnn_handle(); + T alpha = 1.0f, beta = 0.0f; + + if (input_grad) { + T *input_grad_data = input_grad->mutable_data(ctx.GetPlace()); + // Because beta is zero, it is unnecessary to reset input_grad. + + PADDLE_ENFORCE(platform::dynload::cudnnPoolingBackward( + handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data, + cudnn_output_desc, output_grad_data, cudnn_input_desc, input_data, + &beta, cudnn_input_desc, input_grad_data)); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_KERNEL(pool2d, CUDNN, ::paddle::platform::CUDAPlace, + ops::PoolCUDNNOpKernel, + ops::PoolCUDNNOpKernel); +REGISTER_OP_KERNEL(pool2d_grad, CUDNN, ::paddle::platform::CUDAPlace, + ops::PoolCUDNNGradOpKernel, + ops::PoolCUDNNGradOpKernel); + +REGISTER_OP_KERNEL(pool3d, CUDNN, ::paddle::platform::CUDAPlace, + ops::PoolCUDNNOpKernel, + ops::PoolCUDNNOpKernel); +REGISTER_OP_KERNEL(pool3d_grad, CUDNN, ::paddle::platform::CUDAPlace, + ops::PoolCUDNNGradOpKernel, + ops::PoolCUDNNGradOpKernel); diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..9dd33eefc5fd18ba08dce5ea8dff791cda54332c --- /dev/null +++ b/paddle/fluid/operators/pool_op.cc @@ -0,0 +1,306 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/pool_op.h" + +namespace paddle { +namespace operators { + +int OutputSizePool(int input_size, int filter_size, int padding, int stride) { + int output_size = (input_size - filter_size + 2 * padding) / stride + 1; + return output_size; +} + +void PoolOp::InferShape(framework::InferShapeContext *ctx) const { + PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) of Pooling should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Out(Output) of Pooling should not be null."); + + auto in_x_dims = ctx->GetInputDim("X"); + + std::string pooling_type = ctx->Attrs().Get("pooling_type"); + std::vector ksize = ctx->Attrs().Get>("ksize"); + std::vector strides = ctx->Attrs().Get>("strides"); + std::vector paddings = ctx->Attrs().Get>("paddings"); + + PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5, + "Pooling intput should be 4-D or 5-D tensor."); + + if (ctx->Attrs().Get("global_pooling")) { + ksize.resize(static_cast(in_x_dims.size()) - 2); + for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; + ksize[i] = static_cast(in_x_dims[i + 2]); + } + } + + PADDLE_ENFORCE(in_x_dims.size() - ksize.size() == 2U, + "Input size and pooling size should be consistent."); + PADDLE_ENFORCE_EQ(ksize.size(), strides.size(), + "Strides size and pooling size should be the same."); + PADDLE_ENFORCE_EQ(ksize.size(), paddings.size(), + "Paddings size and pooling size should be the same."); + + std::vector output_shape({in_x_dims[0], in_x_dims[1]}); + for (size_t i = 0; i < ksize.size(); ++i) { + output_shape.push_back( + OutputSizePool(in_x_dims[i + 2], ksize[i], paddings[i], strides[i])); + } + ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + ctx->ShareLoD("X", "Out"); +} + +framework::OpKernelType PoolOp::GetExpectedKernelType( + const framework::ExecutionContext &ctx) const { + bool use_cudnn = ctx.Attr("use_cudnn"); + use_cudnn &= platform::is_gpu_place(ctx.GetPlace()); +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(ctx.GetPlace())) { + auto &dev_ctx = ctx.template device_context(); + use_cudnn &= dev_ctx.cudnn_handle() != nullptr; + } +#endif + framework::LibraryType library_; + if (use_cudnn) { + library_ = framework::LibraryType::kCUDNN; + } else { + library_ = framework::LibraryType::kPlain; + } + + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), + layout_, library_); +} + +void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Input(X@GRAD) should not be null."); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); +} + +framework::OpKernelType PoolOpGrad::GetExpectedKernelType( + const framework::ExecutionContext &ctx) const { + bool use_cudnn = ctx.Attr("use_cudnn"); + use_cudnn &= platform::is_gpu_place(ctx.GetPlace()); +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(ctx.GetPlace())) { + auto &dev_ctx = ctx.template device_context(); + use_cudnn &= dev_ctx.cudnn_handle() != nullptr; + } +#endif + framework::LibraryType library_; + if (use_cudnn) { + library_ = framework::LibraryType::kCUDNN; + } else { + library_ = framework::LibraryType::kPlain; + } + + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = 
framework::StringToDataLayout(data_format); + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), + layout_, library_); +} + +Pool2dOpMaker::Pool2dOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "X", + "(Tensor) The input tensor of pooling operator. " + "The format of input tensor is NCHW, where N is batch size, C is the " + "number of channels, H is the height of the feature, " + "and W is the width of the feature."); + AddOutput("Out", + "(Tensor) The output tensor of pooling operator. " + "The format of output tensor is also NCHW, " + "where N is batch size, C is the number of channels, " + "H is the height of the feature, " + "and W is the width of the feature."); + + AddAttr("pooling_type", + "(string), pooling type, can be \"max\" for max-pooling " + "and \"avg\" for average-pooling.") + .InEnum({"max", "avg"}); + AddAttr>("ksize", + "(vector) The pooling window " + "size(height, width) of the pooling operator. " + "If global_pooling = true, ksize and paddings will " + "be ignored."); // TODO(Chengduo): Add checker. + // (Currently, + // TypedAttrChecker don't support vector type.) + AddAttr("global_pooling", + "(bool, default false) Whether to use the global pooling. " + "If global_pooling = true, ksize and paddings will be ignored.") + .SetDefault(false); + AddAttr>("strides", + "(vector, default {1, 1}), strides(height, " + "width) of pooling operator.") + .SetDefault({1, 1}); + // TODO(Chengduo): Add checker. (Currently, + // TypedAttrChecker don't support vector type.) + AddAttr>( + "paddings", + "(vector, default {0,0}), paddings(height, width) of pooling " + "operator." + "If global_pooling = true, paddings and ksize will be ignored.") + .SetDefault({0, 0}); + AddAttr( + "use_cudnn", + "(bool, default false) Only used in cudnn kernel, need install cudnn") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); + // TODO(dzhwinter): need to registered layout transform function + + AddComment(R"DOC( +Pool2d Operator. + +The pooling2d operation calculates the output based on +the input, pooling_type and ksize, strides, paddings parameters. +Input(X) and output(Out) are in NCHW format, where N is batch size, C is the +number of channels, H is the height of the feature, and W is the width of the feature. +Parameters(ksize, strides, paddings) are two elements. +These two elements represent height and width, respectively. +The input(X) size and output(Out) size may be different. + +Example: + Input: + X shape: $(N, C, H_{in}, W_{in})$ + Output: + Out shape: $(N, C, H_{out}, W_{out})$ + Where + $$ + H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\ + W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 + $$ + +)DOC"); +} + +Pool3dOpMaker::Pool3dOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(Tensor) The input tensor of pooling operator. " + "The format of input tensor is NCDHW, where N is batch size, C is " + "the number of channels, and D, H and W is the depth, height and " + "width of " + "the feature, respectively."); + AddOutput("Out", + "(Tensor) The output tensor of pooling operator." 
+ "The format of output tensor is also NCDHW, " + "where N is batch size, C is " + "the number of channels, and D, H and W is the depth, height and " + "width of the feature, respectively."); + + AddAttr("pooling_type", + "(string) Pooling type, can be \"max\" for max-pooling " + "and \"avg\" for average-pooling.") + .InEnum({"max", "avg"}); + AddAttr>( + "ksize", + "(vector) The pooling window size(depth, height, " + "width) of pooling operator. " + "If global_pooling = true, ksize and paddings will " + "be ignored."); // TODO(Chengduo): Add checker. + // (Currently, + // TypedAttrChecker don't support vector type.) + AddAttr( + "global_pooling", + "(bool, default false) Whether to use the global pooling. " + "If global_pooling = true, ksize and paddings wille be ignored.") + .SetDefault(false); + AddAttr>( + "strides", + "(vector, default {1,1,1}) Strides(depth, height, " + "width) of the pooling operator.") + .SetDefault({1, 1, 1}); // TODO(Chengduo): Add checker. (Currently, + // TypedAttrChecker don't support vector type.) + AddAttr>( + "paddings", + "(vector, default {0,0,0}), paddings(depth, height, " + "width) of pooling operator. " + "If global_pooling = true, ksize and paddings will be ignored.") + .SetDefault({0, 0, 0}); // TODO(Chengduo): Add checker. (Currently, + // TypedAttrChecker don't support vector type.) + + AddAttr( + "use_cudnn", + "(bool, default false) Only used in cudnn kernel, need install cudnn") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); + // TODO(dzhwinter): need to registered layout transform function + + AddComment(R"DOC( +Pool3d Operator. + +The pooling3d operation calculates the output based on +the input, pooling_type, ksize, strides, and paddings parameters. +Input(X) and output(Out) are in NCDHW format, where N is batch +size, C is the number of channels, and D, H and W are the depth, height and +width of the feature, respectively. Parameters(ksize, strides, paddings) +are three elements. These three elements represent depth, height and +width, respectively. The input(X) size and output(Out) size may be different. 
+ +Example: + Input: + X shape: $(N, C, D_{in}, H_{in}, W_{in})$ + Output: + Out shape: $(N, C, D_{out}, H_{out}, W_{out})$ + Where + $$ + D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\ + H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\ + W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1 + $$ + +)DOC"); +} +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP(pool2d, ops::PoolOp, ops::Pool2dOpMaker, pool2d_grad, + ops::PoolOpGrad); + +REGISTER_OP_CPU_KERNEL( + pool2d, ops::PoolKernel, + ops::PoolKernel); +REGISTER_OP_CPU_KERNEL( + pool2d_grad, ops::PoolGradKernel, + ops::PoolGradKernel) + +REGISTER_OP(pool3d, ops::PoolOp, ops::Pool3dOpMaker, pool3d_grad, + ops::PoolOpGrad); + +REGISTER_OP_CPU_KERNEL( + pool3d, ops::PoolKernel, + ops::PoolKernel); +REGISTER_OP_CPU_KERNEL( + pool3d_grad, ops::PoolGradKernel, + ops::PoolGradKernel); diff --git a/paddle/fluid/operators/pool_op.cu.cc b/paddle/fluid/operators/pool_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..14486c07402af387ee11a127ba193ac9ac36c8a2 --- /dev/null +++ b/paddle/fluid/operators/pool_op.cu.cc @@ -0,0 +1,33 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/pool_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + pool2d, ops::PoolKernel, + ops::PoolKernel); +REGISTER_OP_CUDA_KERNEL( + pool2d_grad, + ops::PoolGradKernel, + ops::PoolGradKernel); + +REGISTER_OP_CUDA_KERNEL( + pool3d, ops::PoolKernel, + ops::PoolKernel); +REGISTER_OP_CUDA_KERNEL( + pool3d_grad, + ops::PoolGradKernel, + ops::PoolGradKernel); diff --git a/paddle/fluid/operators/pool_op.h b/paddle/fluid/operators/pool_op.h new file mode 100644 index 0000000000000000000000000000000000000000..4cabd634d66e402282f17ed8724129a3e6e1ff43 --- /dev/null +++ b/paddle/fluid/operators/pool_op.h @@ -0,0 +1,183 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/pooling.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +class PoolOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +class PoolOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +class Pool2dOpMaker : public framework::OpProtoAndCheckerMaker { + public: + Pool2dOpMaker(OpProto* proto, OpAttrChecker* op_checker); +}; + +class Pool3dOpMaker : public framework::OpProtoAndCheckerMaker { + public: + Pool3dOpMaker(OpProto* proto, OpAttrChecker* op_checker); +}; + +template +class PoolKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* in_x = context.Input("X"); + Tensor* out = context.Output("Out"); + + std::string pooling_type = context.Attr("pooling_type"); + std::vector ksize = context.Attr>("ksize"); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + if (context.Attr("global_pooling")) { + for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; + ksize[i] = static_cast(in_x->dims()[i + 2]); + } + } + auto& dev_ctx = context.template device_context(); + switch (ksize.size()) { + case 2: { + if (pooling_type == "max") { + paddle::operators::math::Pool2dFunctor< + DeviceContext, paddle::operators::math::MaxPool, T> + pool2d_forward; + paddle::operators::math::MaxPool pool_process; + pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, + out); + + } else if (pooling_type == "avg") { + paddle::operators::math::Pool2dFunctor< + DeviceContext, paddle::operators::math::AvgPool, T> + pool2d_forward; + paddle::operators::math::AvgPool pool_process; + pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, + out); + } + } break; + case 3: { + if (pooling_type == "max") { + paddle::operators::math::Pool3dFunctor< + DeviceContext, paddle::operators::math::MaxPool, T> + pool3d_forward; + paddle::operators::math::MaxPool pool_process; + pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, + out); + } else if (pooling_type == "avg") { + paddle::operators::math::Pool3dFunctor< + DeviceContext, paddle::operators::math::AvgPool, T> + pool3d_forward; + paddle::operators::math::AvgPool pool_process; + pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, + out); + } + } break; + default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } + } + } +}; + +template +class PoolGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* in_x = context.Input("X"); + const Tensor* out = context.Input("Out"); + const Tensor* out_grad = + context.Input(framework::GradVarName("Out")); + Tensor* in_x_grad = context.Output(framework::GradVarName("X")); + + 
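+ // Read the pooling attributes; when global_pooling is set, ksize is replaced by the input's spatial dimensions and the paddings are zeroed before dispatching to the 2-D or 3-D backward functor.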
std::string pooling_type = context.Attr("pooling_type"); + std::vector ksize = context.Attr>("ksize"); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + + if (context.Attr("global_pooling")) { + for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; + ksize[i] = static_cast(in_x->dims()[i + 2]); + } + } + auto& dev_ctx = context.template device_context(); + if (in_x_grad) { + in_x_grad->mutable_data(context.GetPlace()); + paddle::operators::math::SetConstant set_constant; + set_constant(dev_ctx, in_x_grad, 0.0); + + switch (ksize.size()) { + case 2: { + if (pooling_type == "max") { + paddle::operators::math::MaxPool2dGradFunctor + pool2d_backward; + pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, + paddings, in_x_grad); + } else if (pooling_type == "avg") { + paddle::operators::math::Pool2dGradFunctor< + DeviceContext, paddle::operators::math::AvgPoolGrad, T> + pool2d_backward; + paddle::operators::math::AvgPoolGrad pool_process; + pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, + paddings, pool_process, in_x_grad); + } + } break; + case 3: { + if (pooling_type == "max") { + paddle::operators::math::MaxPool3dGradFunctor + pool3d_backward; + pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, + paddings, in_x_grad); + } else if (pooling_type == "avg") { + paddle::operators::math::Pool3dGradFunctor< + DeviceContext, paddle::operators::math::AvgPoolGrad, T> + pool3d_backward; + paddle::operators::math::AvgPoolGrad pool_process; + pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, + paddings, pool_process, in_x_grad); + } + } break; + default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/pool_with_index_op.cc b/paddle/fluid/operators/pool_with_index_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..ef6d5d867b2d38b3ca26deb1cbd9f16ca9846d0f --- /dev/null +++ b/paddle/fluid/operators/pool_with_index_op.cc @@ -0,0 +1,291 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/pool_with_index_op.h" + +namespace paddle { +namespace operators { + +inline int OutputSizeMaxPool(int input_size, int filter_size, int padding, + int stride) { + int output_size = (input_size - filter_size + 2 * padding) / stride + 1; + return output_size; +} + +class MaxPoolWithIndexOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of Pooling should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of Pooling should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Mask"), + "Output(Mask) of Pooling should not be null."); + + auto in_x_dims = ctx->GetInputDim("X"); + + std::vector ksize = ctx->Attrs().Get>("ksize"); + std::vector strides = ctx->Attrs().Get>("strides"); + std::vector paddings = ctx->Attrs().Get>("paddings"); + + PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5, + "Pooling intput should be 4-D or 5-D tensor."); + + if (ctx->Attrs().Get("global_pooling")) { + ksize.resize(static_cast(in_x_dims.size()) - 2); + for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; + ksize[i] = static_cast(in_x_dims[i + 2]); + } + } + + PADDLE_ENFORCE(in_x_dims.size() - ksize.size() == 2U, + "Input size and pooling size should be consistent."); + PADDLE_ENFORCE_EQ(ksize.size(), strides.size(), + "Strides size and pooling size should be the same."); + PADDLE_ENFORCE_EQ(ksize.size(), paddings.size(), + "Paddings size and pooling size should be the same."); + + std::vector output_shape({in_x_dims[0], in_x_dims[1]}); + for (size_t i = 0; i < ksize.size(); ++i) { + output_shape.push_back(OutputSizeMaxPool(in_x_dims[i + 2], ksize[i], + paddings[i], strides[i])); + } + ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + ctx->SetOutputDim("Mask", framework::make_ddim(output_shape)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +class MaxPoolWithIndexOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Mask"), "Input(Mask) must not be null."); + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Input(X@GRAD) should not be null."); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MaxPool2dWithIndexOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "X", + "(Tensor) The input tensor of pooling operator. " + "The format of input tensor is NCHW, where N is batch size, C is the " + "number of channels, H is the height of the image, " + "and W is the width of the image."); + AddOutput("Out", + "(Tensor) The output tensor of pooling operator. 
" + "The format of output tensor is also NCHW, " + "where N is batch size, C is " + "the number of channels, H is the height of the image " + "and W is the width of the image."); + AddOutput("Mask", + "(Tensor) The Mask tensor of pooling operator." + "The format of output tensor is also NCHW, " + "where N is batch size, C is the number of channels, " + "H is the height of the image, " + "and W is the width of the image. " + "It represents the index in the current feature map."); + + AddAttr>("ksize", + "(vector) The pooling window size(height, " + "width) of pooling operator. " + "If global_pooling = true, ksize and paddings " + "will be ignored."); // TODO(Chengduo): Add + // checker. (Currently, + // TypedAttrChecker don't support vector type.) + AddAttr( + "global_pooling", + "(bool, default:false) Whether to use the global pooling. " + "If global_pooling = true, ksize and paddings will be ignored.") + .SetDefault(false); + AddAttr>("strides", + "(vector, default {1, 1}), strides(height, " + "width) of pooling operator.") + .SetDefault({1, 1}); // TODO(Chengduo): Add checker. (Currently, + // TypedAttrChecker don't support vector type.) + AddAttr>( + "paddings", + "(vector, default:{0, 0}), paddings(height, width) of pooling " + "operator. " + "If global_pooling = true, paddings and will be ignored.") + .SetDefault({0, 0}); // TODO(Chengduo): Add checker. (Currently, + // TypedAttrChecker don't support vector type.) + + AddComment(R"DOC( +MaxPool2d Operator. + +The maxPooling2d with index operation calculates the output and the mask +based on the input, ksize, strides, and paddings parameters. Input(X) and +output(Out, Mask) are in NCHW format, where N is batch size, C is the +number of channels, H is the height of the feature, +and W is the width of the feature. +Parameters(ksize, strides, paddings) are two elements. +These two elements represent height and width, respectively. +The input(X) size and output(Out, Mask) size may be different. + +Example: + Input: + X shape: $(N, C, H_{in}, W_{in})$ + Output: + Out shape: $(N, C, H_{out}, W_{out})$ + Mask shape: $(N, C, H_{out}, W_{out})$ + Where + $$ + H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\ + W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 + $$ + +)DOC"); + } +}; + +class MaxPool3dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MaxPool3dWithIndexOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(Tensor) The input tensor of pooling operator. " + "The format of input tensor is NCDHW, where N is batch size, C is " + "the number of channels, and D, H and W are the depth, height and " + "width of " + "the image, respectively"); + AddOutput("Out", + "(Tensor) The output tensor of pooling operator. " + "The format of output tensor is also NCDHW, " + "where N is the batch size, C is the number of channels, " + "and D, H and W are the depth, height and " + "width of the image, respectively."); + AddOutput("Mask", + "(Tensor) The Mask tensor of pooling operator. " + "The format of output tensor is also NCDHW, " + "where N is the batch size, C is the number of channels, and " + "D, H and W are the depth, height and width " + "of the image, respectively. " + "It represents the index in the current feature map."); + + AddAttr>("ksize", + "(vector) The pooling window size(depth, " + "height, width) of pooling operator. 
" + "If global_pooling = true, ksize and paddings " + "will be ignored."); // TODO(Chengduo): Add + // checker. (Currently, + // TypedAttrChecker don't support vector type.) + AddAttr( + "global_pooling", + "(bool, default false) Whether to use the global pooling. " + "If global_pooling = true, ksize and paddings will be ignored.") + .SetDefault(false); + AddAttr>("strides", + "(vector, default {1,1,1}), strides(depth, " + "height, width) of pooling operator.") + .SetDefault({1, 1, 1}); // TODO(Chengduo): Add checker. (Currently, + // TypedAttrChecker don't support vector type.) + AddAttr>( + "paddings", + "(vector, default {0,0,0}), paddings(depth, " + "height, width) of pooling operator. " + "If global_pooling = true, paddings and ksize will be ignored.") + .SetDefault({0, 0, 0}); // TODO(Chengduo): Add checker. (Currently, + // TypedAttrChecker don't support vector type.) + + AddComment(R"DOC( +MaxPool3d Operator. + +The maxpooling3d with index operation calculates the output and the mask +based on the input and ksize, strides, paddings parameters. +Input(X) and output(Out, Mask) are in NCDHW format, where N is batch +size, C is the number of channels, and D, H and W are the depth, height and +width of the feature, respectively. +Parameters(ksize, strides, paddings) are three elements. +These three elements represent depth, height and width, respectively. +The input(X) size and output(Out, Mask) size may be different. + +Example: + Input: + X shape: $(N, C, D_{in}, H_{in}, W_{in})$ + Output: + Out shape: $(N, C, D_{out}, H_{out}, W_{out})$ + Mask shape: $(N, C, D_{out}, H_{out}, W_{out})$ + Where + $$ + D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\ + H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\ + W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1 + $$ + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP(max_pool2d_with_index, ops::MaxPoolWithIndexOp, + ops::MaxPool2dWithIndexOpMaker, max_pool2d_with_index_grad, + ops::MaxPoolWithIndexOpGrad); + +REGISTER_OP_CPU_KERNEL( + max_pool2d_with_index, + ops::MaxPoolWithIndexKernel, + ops::MaxPoolWithIndexKernel); +REGISTER_OP_CPU_KERNEL( + max_pool2d_with_index_grad, + ops::MaxPoolWithIndexGradKernel, + ops::MaxPoolWithIndexGradKernel) + +REGISTER_OP(max_pool3d_with_index, ops::MaxPoolWithIndexOp, + ops::MaxPool3dWithIndexOpMaker, max_pool3d_with_index_grad, + ops::MaxPoolWithIndexOpGrad); + +REGISTER_OP_CPU_KERNEL( + max_pool3d_with_index, + ops::MaxPoolWithIndexKernel, + ops::MaxPoolWithIndexKernel); +REGISTER_OP_CPU_KERNEL( + max_pool3d_with_index_grad, + ops::MaxPoolWithIndexGradKernel, + ops::MaxPoolWithIndexGradKernel) diff --git a/paddle/fluid/operators/pool_with_index_op.cu.cc b/paddle/fluid/operators/pool_with_index_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..722a4d1e2a4a4ad5c1268483db46e5e9d5d4a33b --- /dev/null +++ b/paddle/fluid/operators/pool_with_index_op.cu.cc @@ -0,0 +1,43 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/pool_with_index_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + max_pool2d_with_index, + ops::MaxPoolWithIndexKernel, + ops::MaxPoolWithIndexKernel); +REGISTER_OP_CUDA_KERNEL( + max_pool2d_with_index_grad, + ops::MaxPoolWithIndexGradKernel, + ops::MaxPoolWithIndexGradKernel) + +REGISTER_OP_CUDA_KERNEL( + max_pool3d_with_index, + ops::MaxPoolWithIndexKernel, + ops::MaxPoolWithIndexKernel); +REGISTER_OP_CUDA_KERNEL( + max_pool3d_with_index_grad, + ops::MaxPoolWithIndexGradKernel, + ops::MaxPoolWithIndexGradKernel) diff --git a/paddle/fluid/operators/pool_with_index_op.h b/paddle/fluid/operators/pool_with_index_op.h new file mode 100644 index 0000000000000000000000000000000000000000..da7ef9df73a51aabc208521880168144de6f392c --- /dev/null +++ b/paddle/fluid/operators/pool_with_index_op.h @@ -0,0 +1,110 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/pooling.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class MaxPoolWithIndexKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* in_x = context.Input("X"); + Tensor* out = context.Output("Out"); + Tensor* mask = context.Output("Mask"); + + std::vector ksize = context.Attr>("ksize"); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + + auto& dev_ctx = context.template device_context(); + if (context.Attr("global_pooling")) { + for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; + ksize[i] = static_cast(in_x->dims()[i + 2]); + } + } + + switch (ksize.size()) { + case 2: { + paddle::operators::math::MaxPool2dWithIndexFunctor + pool2d_forward; + pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, out, mask); + } break; + case 3: { + paddle::operators::math::MaxPool3dWithIndexFunctor + pool3d_forward; + pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, out, mask); + } break; + default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } + } + } +}; + +template +class MaxPoolWithIndexGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* mask = context.Input("Mask"); + const Tensor* out_grad = + context.Input(framework::GradVarName("Out")); + Tensor* in_x_grad = context.Output(framework::GradVarName("X")); + + std::vector ksize = context.Attr>("ksize"); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + if (context.Attr("global_pooling")) { + for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; + ksize[i] = static_cast(in_x_grad->dims()[i + 2]); + } + } + + if (in_x_grad) { + in_x_grad->mutable_data(context.GetPlace()); + auto& device_ctx = context.template device_context(); + math::set_constant(device_ctx, in_x_grad, 0); + + switch (ksize.size()) { + case 2: { + paddle::operators::math::MaxPool2dWithIndexGradFunctor + pool2d_backward; + pool2d_backward(device_ctx, *out_grad, *mask, ksize, strides, + paddings, in_x_grad); + } break; + case 3: { + paddle::operators::math::MaxPool3dWithIndexGradFunctor + pool3d_backward; + pool3d_backward(device_ctx, *out_grad, *mask, ksize, strides, + paddings, in_x_grad); + } break; + default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } + } + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/positive_negative_pair_op.cc b/paddle/fluid/operators/positive_negative_pair_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d237da25a00de13057e009b6705d3241b8b26539 --- /dev/null +++ b/paddle/fluid/operators/positive_negative_pair_op.cc @@ -0,0 +1,179 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/positive_negative_pair_op.h" + +namespace paddle { +namespace operators { + +class PositiveNegativePairOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE( + ctx->HasInput("Score"), + "Input(Score) of PositiveNegativePairOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("Label"), + "Input(Label) of PositiveNegativePairOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("QueryID"), + "Input(QueryID) of PositiveNegativePairOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("PositivePair"), + "Output(PositivePair) of PositiveNegativePairOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("NegativePair"), + "Output(NegativePair) of PositiveNegativePairOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("NeutralPair"), + "Output(NeutralPair) of PositiveNegativePairOp should not be null."); + auto scalar_dim = framework::make_ddim({1}); + if (ctx->HasInput("AccumulatePositivePair") || + ctx->HasInput("AccumulateNegativePair") || + ctx->HasInput("AccumulateNeutralPair")) { + PADDLE_ENFORCE(ctx->HasInput("AccumulatePositivePair") && + ctx->HasInput("AccumulateNegativePair") && + ctx->HasInput("AccumulateNeutralPair"), + "All optional inputs(AccumulatePositivePair, " + "AccumulateNegativePair, AccumulateNeutralPair) of " + "PositiveNegativePairOp are required if one of them is " + "specified."); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("AccumulatePositivePair"), scalar_dim, + "Shape of AccumulatePositivePair should be {1}."); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("AccumulateNegativePair"), scalar_dim, + "Shape of AccumulateNegativePair should be {1}."); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("AccumulateNeutralPair"), scalar_dim, + "Shape of AccumulateNeutralPair should be {1}."); + } + + auto score_dim = ctx->GetInputDim("Score"); + auto label_dim = ctx->GetInputDim("Label"); + auto query_dim = ctx->GetInputDim("QueryID"); + PADDLE_ENFORCE_EQ(score_dim.size(), 2, "Score should be a 2-D tensor."); + PADDLE_ENFORCE_EQ(label_dim.size(), 2, "Label should be a 2-D tensor."); + PADDLE_ENFORCE_EQ( + label_dim[0], score_dim[0], + "Tensor Score and Label should have the same height (batch size)."); + PADDLE_ENFORCE_EQ(label_dim[1], 1, + "The width of Label should be 1, i.e. 
each item should " + "have a scalar label."); + PADDLE_ENFORCE(query_dim == label_dim, + "QueryID should have the same shape as Label."); + if (ctx->HasInput("Weight")) { + PADDLE_ENFORCE(ctx->GetInputDim("Weight") == label_dim, + "Weight should have the same shape as Label."); + } + int column = ctx->Attrs().Get("column"); + auto depth = score_dim[1]; + PADDLE_ENFORCE(column < depth && column >= -depth, + "Attribute column should be in the range of [-%l, %l)", + depth, depth); + + ctx->SetOutputDim("PositivePair", scalar_dim); + ctx->SetOutputDim("NegativePair", scalar_dim); + ctx->SetOutputDim("NeutralPair", scalar_dim); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Score")->type()), + ctx.device_context()); + } +}; + +class PositiveNegativePairOpMaker : public framework::OpProtoAndCheckerMaker { + public: + PositiveNegativePairOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Score", + "(Tensor, float) Model Score on an item (with " + "respect to QueryID). It's a 2-D tensor with shape [batch_size, " + "depth], where the column specified by the attribute \"column\" " + "is used as item score."); + AddInput("Label", + "(Tensor, float) Label of an item (with repsect to " + "QueryId). It's a 2-D tensor with shape [batch_size, 1]."); + AddInput("QueryID", + "(Tensor, int64) Query ID that indicates the context. Its shape " + "should be the same as Label."); + AddInput( + "AccumulatePositivePair", + "(float) Optional. The accumulated number of positive pairs over a " + "stream of data. If provided, the output PositivePair will be " + "initialized with this number rather than 0. it won't be modified " + "in place.") + .AsDispensable(); + AddInput( + "AccumulateNegativePair", + "(float) Optional. The accumulated number of negative pairs over a " + "stream of data. If provided, the output NegativePair will be " + "initialized with this number rather than 0. it won't be modified " + "in place.") + .AsDispensable(); + AddInput("AccumulateNeutralPair", + "(float) Optional. The accumulated number of neutral pairs over a " + "stream of data. If provided, the output NeutralPair will be " + "initialized with this number rather than 0. it won't be modified " + "in place.") + .AsDispensable(); + AddInput("Weight", + "(float) Optional. Weight of current item. If specified, its " + "shape should be the same as Label, and the meaning of the output " + "changes from numbers of pairs to the total sum of pairs' " + "weights. Weight of a pair of items is the average of their " + "weights.") + .AsDispensable(); + AddOutput("PositivePair", + "(float) Number of positive pairs, i.e. the pairs of " + "items that are ranked correctly."); + AddOutput("NegativePair", + "(float) Number of negative pairs, i.e. the pairs of " + "items that are ranked incorrectly."); + AddOutput("NeutralPair", + "(float) Number of neutral pairs, i.e. the pairs of items " + "that have the same score.") + .AsDispensable(); + AddAttr( + "column", + "(int, default -1) The column position of Score used to rank items in " + "descending order. It must be in the range of [-rank(Score), " + "rank(Score)). " + "If `dim < 0`, the dim to reduce is `rank + dim`. 
" + "Noting that reducing on the first dim will make the LoD info lost.") + .SetDefault(0); + AddComment(R"DOC( +PositiveNegativePairOp can be used to evaluate Learning To Rank(LTR) model's +performance. + +Within some context, e.g. the "query", a LTR model generates scores for a list +of items, which gives a partial order of the items. PositiveNegativePairOp +takes a list of reference rank order (Input("Label")) and the model generated +scores (Input(Score)) as inputs and counts the pairs that ranked correctly +and incorrectly. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(positive_negative_pair, + ops::PositiveNegativePairOp, + ops::PositiveNegativePairOpMaker); +REGISTER_OP_CPU_KERNEL( + positive_negative_pair, + ops::PositiveNegativePairKernel, + ops::PositiveNegativePairKernel); diff --git a/paddle/fluid/operators/positive_negative_pair_op.h b/paddle/fluid/operators/positive_negative_pair_op.h new file mode 100644 index 0000000000000000000000000000000000000000..f20f33bbeb19766d6974ea17b155cac363c01fb2 --- /dev/null +++ b/paddle/fluid/operators/positive_negative_pair_op.h @@ -0,0 +1,114 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/utils/Logging.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +class PositiveNegativePairKernel : public framework::OpKernel { + public: + struct PredictionResult { + PredictionResult(T score, T label, T weight) + : score(score), label(label), weight(weight) {} + T score; + T label; + T weight; + }; + + void Compute(const framework::ExecutionContext& context) const override { + auto score_t = context.Input("Score"); + auto label_t = context.Input("Label"); + auto query_t = context.Input("QueryID"); + auto acc_positive_t = context.Input("AccumulatePositivePair"); + auto acc_negative_t = context.Input("AccumulateNegativePair"); + auto acc_neutral_t = context.Input("AccumulateNeutralPair"); + auto positive_t = context.Output("PositivePair"); + auto negative_t = context.Output("NegativePair"); + auto neutral_t = context.Output("NeutralPair"); + auto weight_t = context.Input("Weight"); + + auto score = score_t->data(); + auto label = label_t->data(); + auto query = query_t->data(); + const T* weight = nullptr; + if (weight_t != nullptr) { + weight = weight_t->data(); + } + T* positive = positive_t->mutable_data(context.GetPlace()); + T* negative = negative_t->mutable_data(context.GetPlace()); + T* neutral = neutral_t->mutable_data(context.GetPlace()); + + auto score_dim = score_t->dims(); + auto batch_size = score_dim[0]; + auto width = score_dim[1]; + auto column = context.Attr("column"); + if (column < 0) { + column += width; + } + + // construct document instances for each query: Query => List[, ...] + std::unordered_map> predictions; + for (auto i = 0; i < batch_size; ++i) { + if (predictions.find(query[i]) == predictions.end()) { + predictions.emplace( + std::make_pair(query[i], std::vector())); + } + predictions[query[i]].emplace_back(score[i * width + column], label[i], + weight_t != nullptr ? weight[i] : 1.0); + } + + // for each query, accumulate pair counts + T pos = 0, neg = 0, neu = 0; + if (acc_positive_t != nullptr && acc_negative_t != nullptr && + acc_neutral_t != nullptr) { + pos = acc_positive_t->data()[0]; + neg = acc_negative_t->data()[0]; + neu = acc_neutral_t->data()[0]; + } + auto evaluate_one_list = [&pos, &neg, + &neu](std::vector vec) { + for (auto ite1 = vec.begin(); ite1 != vec.end(); ++ite1) { + for (auto ite2 = ite1 + 1; ite2 != vec.end(); ++ite2) { + if (ite1->label == ite2->label) { // labels are equal, ignore. + continue; + } + T w = (ite1->weight + ite2->weight) * 0.5; + if (ite1->score == ite2->score) { + neu += w; + } + (ite1->score - ite2->score) * (ite1->label - ite2->label) > 0.0 + ? pos += w + : neg += w; + } + } + }; + for (auto prediction : predictions) { + evaluate_one_list(prediction.second); + } + *positive = pos; + *negative = neg; + *neutral = neu; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/precision_recall_op.cc b/paddle/fluid/operators/precision_recall_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..30d594719c7274b90a88127028035a49c25e32e7 --- /dev/null +++ b/paddle/fluid/operators/precision_recall_op.cc @@ -0,0 +1,182 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
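To make the pair-counting rule of PositiveNegativePairKernel above easier to audit, here is a standalone summary in Python: items are grouped by QueryID, equal-label pairs are skipped, equal-score pairs count as neutral, and the remaining pairs are positive or negative depending on whether the score order agrees with the label order, each pair weighted by the average of the two item weights. It restates the documented semantics rather than transcribing the kernel line for line, and all names are illustrative.

```python
# Illustrative sketch of the pair counting documented for the
# positive_negative_pair operator. A pair of items under the same query is:
#   - skipped  if the labels are equal,
#   - neutral  if the scores are equal,
#   - positive if the score order agrees with the label order, else negative.
from collections import defaultdict

def positive_negative_pair(scores, labels, query_ids, weights=None):
    weights = weights if weights is not None else [1.0] * len(scores)
    by_query = defaultdict(list)
    for s, l, q, w in zip(scores, labels, query_ids, weights):
        by_query[q].append((s, l, w))
    pos = neg = neu = 0.0
    for items in by_query.values():
        for i in range(len(items)):
            for j in range(i + 1, len(items)):
                (s1, l1, w1), (s2, l2, w2) = items[i], items[j]
                if l1 == l2:
                    continue                 # labels equal: ignore the pair
                w = 0.5 * (w1 + w2)          # pair weight = average of item weights
                if s1 == s2:
                    neu += w
                elif (s1 - s2) * (l1 - l2) > 0:
                    pos += w                 # ranked consistently with the labels
                else:
                    neg += w
    return pos, neg, neu

print(positive_negative_pair(
    scores=[0.9, 0.3, 0.6, 0.2],
    labels=[1.0, 0.0, 1.0, 0.0],
    query_ids=[7, 7, 7, 7]))      # -> (4.0, 0.0, 0.0)
```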
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/precision_recall_op.h" + +namespace paddle { +namespace operators { + +class PrecisionRecallOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("MaxProbs"), + "Input(MaxProbs) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Indices"), + "Input(Indices) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Labels"), + "Input(Labels) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("BatchMetrics"), + "Output(BatchMetrics) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("AccumMetrics"), + "Output(AccumMetrics) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("AccumStatesInfo"), + "Output(AccumStatesInfo) should not be null."); + + int64_t cls_num = + static_cast(ctx->Attrs().Get("class_number")); + auto max_probs_dims = ctx->GetInputDim("MaxProbs"); + auto labels_dims = ctx->GetInputDim("Labels"); + + PADDLE_ENFORCE_EQ(max_probs_dims[1], 1, + "Each instance contains one max probability, so the " + "shape of Input(MaxProbs) should be [batch_size, 1]."); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Indices"), max_probs_dims, + "The shape of Input(Indices) should be [batch_size, 1]."); + PADDLE_ENFORCE_EQ(max_probs_dims[0], labels_dims[0], + "The 1st dimension of Input(MaxProbs) and " + "Input(Labels) both are batch_size and the shape should " + "be the same."); + PADDLE_ENFORCE_EQ(labels_dims[1], 1, + "The 2nd dimension of Input(Labels) contains instance " + "label and the shape should be equal to 1."); + if (ctx->HasInput("Weights")) { + auto weights_dims = ctx->GetInputDim("Weights"); + PADDLE_ENFORCE_EQ(weights_dims, + framework::make_ddim({max_probs_dims[0], 1}), + "The shape of Input(Weights) should be " + "[batch_size, 1]."); + } + if (ctx->HasInput("StatesInfo")) { + auto states_dims = ctx->GetInputDim("StatesInfo"); + PADDLE_ENFORCE_EQ(states_dims, framework::make_ddim({cls_num, 4}), + "The shape of Input(StatesInfo) should be " + "[class_number, 4]."); + } + + // Layouts of BatchMetrics and AccumMetrics both are: + // [ + // macro average precision, macro average recall, macro average F1 score, + // micro average precision, micro average recall, micro average F1 score + // ] + ctx->SetOutputDim("BatchMetrics", {6}); + ctx->SetOutputDim("AccumMetrics", {6}); + // Shape of AccumStatesInfo is [class_number, 4] + // The layout of each row is: + // [ TP, FP, TN, FN ] + ctx->SetOutputDim("AccumStatesInfo", {cls_num, 4}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("MaxProbs")->type()), + ctx.device_context()); + } +}; + +class PrecisionRecallOpMaker : public framework::OpProtoAndCheckerMaker { + public: + PrecisionRecallOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("MaxProbs", + "(Tensor, default Tensor) A 2-D tensor with shape N x 1, " + "where N is the batch size. 
Each row contains the max probability " + "of an instance which computed by the previous top_k (k=1) " + "operator."); + AddInput("Indices", + "(Tensor, default Tensor) A 2-D tensor with shape N x 1, " + "where N is the batch size. Each row contains the corresponding " + "index which computed by the previous top_k (k=1) operator."); + AddInput("Labels", + "(Tensor, default Tensor) A 2-D tensor with shape N x 1, " + "where N is the batch size. Each element is a label and the " + "value should be in [0, class_number - 1]."); + AddInput("Weights", + "(Tensor, default Tensor) A 2-D tensor with shape N x 1, " + "where N is the batch size. This input is optional. If provided, " + "weight of instance would be considered when computing metrics.") + .AsDispensable(); + AddInput("StatesInfo", + "(Tensor, default Tensor) A 2-D tensor with shape D x 4, " + "where D is the number of classes. This input is optional. If " + "provided, current state will be accumulated to this state and " + "the accumulation state will be the output state.") + .AsDispensable(); + AddOutput("BatchMetrics", + "(Tensor, default Tensor) A 1-D tensor with shape {6}. " + "This output tensor contains metrics for current batch data. " + "The layout is [macro average precision, macro average recall, " + "macro f1 score, micro average precision, micro average recall, " + "micro f1 score]."); + AddOutput("AccumMetrics", + "(Tensor, default Tensor) A 1-D tensor with shape {6}. " + "This output tensor contains metrics for accumulated data. " + "The layout is [macro average precision, macro average recall, " + "macro f1 score, micro average precision, micro average recall, " + "micro f1 score]."); + AddOutput("AccumStatesInfo", + "(Tensor, default Tensor) A 2-D tensor with shape D x 4, " + "where D is equal to class number. This output tensor contains " + "accumulated state variables used to compute metrics. The layout " + "for each class is [true positives, false positives, " + "true negatives, false negatives]."); + AddAttr("class_number", "(int) Number of classes to be evaluated."); + AddComment(R"DOC( +Precision Recall Operator. + +When given Input(Indices) and Input(Labels), this operator can be used +to compute various metrics including: +1. macro average precision +2. macro average recall +3. macro f1 score +4. micro average precision +5. micro average recall +6. micro f1 score + +To compute the above metrics, we need to do statistics for true positives, +false positives and false negatives. Here the count of true negatives is not +necessary, but counting it may provide potential usage and the cost is +trivial, so the operator also provides the count of true negatives. + +We define state as a 2-D tensor with shape [class_number, 4]. Each row of a +state contains statistic variables for corresponding class. Layout of each row +is: TP(true positives), FP(false positives), TN(true negatives), +FN(false negatives). If Input(Weights) is provided, TP, FP, TN, FN will be +calculated by given weight instead of the instance count. + +This operator also supports metrics computing for cross-batch situation. To +achieve this, Input(StatesInfo) should be provided. State of current batch +data will be accumulated to Input(StatesInfo) and Output(AccumStatesInfo) +is the accumulation state. + +Output(BatchMetrics) is metrics of current batch data while +Output(AccumStatesInfo) is metrics of accumulation data. 
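As a worked reference for the aggregation described above (illustrative code, not part of the operator), the per-class state rows [TP, FP, TN, FN] can be reduced to the six metric values like this:

```python
# Illustrative sketch of the metric aggregation described above.
# states[c] = [TP, FP, TN, FN] for class c, matching the AccumStatesInfo layout.

def precision(tp, fp):
    return tp / (tp + fp) if (tp > 0 or fp > 0) else 1.0

def recall(tp, fn):
    return tp / (tp + fn) if (tp > 0 or fn > 0) else 1.0

def f1(p, r):
    return 2 * p * r / (p + r) if (p > 0 or r > 0) else 0.0

def metrics(states):
    per_class_p = [precision(tp, fp) for tp, fp, tn, fn in states]
    per_class_r = [recall(tp, fn) for tp, fp, tn, fn in states]
    macro_p = sum(per_class_p) / len(states)
    macro_r = sum(per_class_r) / len(states)
    total_tp = sum(s[0] for s in states)
    total_fp = sum(s[1] for s in states)
    total_fn = sum(s[3] for s in states)
    micro_p = precision(total_tp, total_fp)
    micro_r = recall(total_tp, total_fn)
    # Same 6-element layout as BatchMetrics / AccumMetrics.
    return [macro_p, macro_r, f1(macro_p, macro_r),
            micro_p, micro_r, f1(micro_p, micro_r)]

# Two classes: class 0 with 3 TP / 1 FP / 0 FN, class 1 with 2 TP / 0 FP / 1 FN.
print(metrics([[3, 1, 5, 0], [2, 0, 6, 1]]))
```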
+ +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(precision_recall, ops::PrecisionRecallOp, + ops::PrecisionRecallOpMaker); +REGISTER_OP_CPU_KERNEL( + precision_recall, + ops::PrecisionRecallKernel, + ops::PrecisionRecallKernel); diff --git a/paddle/fluid/operators/precision_recall_op.h b/paddle/fluid/operators/precision_recall_op.h new file mode 100644 index 0000000000000000000000000000000000000000..7dae86b76fc8d50e2ed5fe353920b68f7a846fb1 --- /dev/null +++ b/paddle/fluid/operators/precision_recall_op.h @@ -0,0 +1,161 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenMatrix = framework::EigenMatrix; + +enum StateVariable { TP = 0, FP, TN, FN }; + +template +class PrecisionRecallKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in0 = ctx.Input("Indices"); + auto* in1 = ctx.Input("Labels"); + auto* in2 = ctx.Input("Weights"); + auto* in3 = ctx.Input("StatesInfo"); + auto* out0 = ctx.Output("BatchMetrics"); + auto* out1 = ctx.Output("AccumMetrics"); + auto* out2 = ctx.Output("AccumStatesInfo"); + + const int* ids_data = in0->data(); + const int* labels_data = in1->data(); + size_t cls_num = static_cast(ctx.Attr("class_number")); + const T* weights_data = in2 ? in2->data() : nullptr; + const T* states_data = in3 ? in3->data() : nullptr; + double* batch_metrics_data = out0->mutable_data(ctx.GetPlace()); + double* accum_metrics_data = out1->mutable_data(ctx.GetPlace()); + out2->mutable_data(ctx.GetPlace()); + auto accum_states = EigenMatrix::From(*out2); + accum_states.setZero(); + T* accum_states_data = out2->data(); + + size_t sample_num = in0->dims()[0]; + size_t state_var_num = 4; // TP FP TN FN + + // get states info for current batch + for (size_t i = 0; i < sample_num; ++i) { + size_t idx = ids_data[i]; + size_t label = labels_data[i]; + + PADDLE_ENFORCE(idx >= 0 && idx < cls_num, + "Class index of each instance should be in " + "[0, class_number)."); + PADDLE_ENFORCE(label >= 0 && label < cls_num, + "Label of each instance should be in [0, class_number)."); + + T w = weights_data ? 
weights_data[i] : 1.0; + if (idx == label) { + accum_states_data[idx * state_var_num + TP] += w; + for (size_t j = 0; j < cls_num; ++j) { + accum_states_data[j * state_var_num + TN] += w; + } + accum_states_data[idx * state_var_num + TN] -= w; + } else { + accum_states_data[label * state_var_num + FN] += w; + accum_states_data[idx * state_var_num + FP] += w; + for (size_t j = 0; j < cls_num; ++j) { + accum_states_data[j * state_var_num + TN] += w; + } + accum_states_data[idx * state_var_num + TN] -= w; + accum_states_data[label * state_var_num + TN] -= w; + } + } + + ComputeMetrics(accum_states_data, batch_metrics_data, state_var_num, + cls_num); + + if (states_data) { + for (size_t i = 0; i < cls_num; ++i) { + for (size_t j = 0; j < state_var_num; ++j) { + size_t idx = i * state_var_num + j; + accum_states_data[idx] += states_data[idx]; + } + } + } + + ComputeMetrics(accum_states_data, accum_metrics_data, state_var_num, + cls_num); + } + + // expose to be reused + static inline T CalcPrecision(T tp_count, T fp_count) { + if (tp_count > 0.0 || fp_count > 0.0) { + return tp_count / (tp_count + fp_count); + } + return 1.0; + } + + static inline T CalcRecall(T tp_count, T fn_count) { + if (tp_count > 0.0 || fn_count > 0.0) { + return tp_count / (tp_count + fn_count); + } + return 1.0; + } + + static inline T CalcF1Score(T precision, T recall) { + if (precision > 0.0 || recall > 0.0) { + return 2 * precision * recall / (precision + recall); + } + return 0.0; + } + + protected: + void ComputeMetrics(const T* states_data, double* metrics_data, + size_t state_var_num, size_t cls_num) const { + T total_tp_count = 0; + T total_fp_count = 0; + T total_fn_count = 0; + T macro_avg_precision = 0.0; + T macro_avg_recall = 0.0; + + for (size_t i = 0; i < cls_num; ++i) { + T tp_count = states_data[i * state_var_num + TP]; + T fp_count = states_data[i * state_var_num + FP]; + T fn_count = states_data[i * state_var_num + FN]; + total_tp_count += tp_count; + total_fp_count += fp_count; + total_fn_count += fn_count; + macro_avg_precision += CalcPrecision(tp_count, fp_count); + macro_avg_recall += CalcRecall(tp_count, fn_count); + } + macro_avg_precision /= cls_num; + macro_avg_recall /= cls_num; + T macro_f1_score = CalcF1Score(macro_avg_precision, macro_avg_recall); + + T micro_avg_precision = CalcPrecision(total_tp_count, total_fp_count); + T micro_avg_recall = CalcRecall(total_tp_count, total_fn_count); + T micro_f1_score = CalcF1Score(micro_avg_precision, micro_avg_recall); + + // fill metrics data + metrics_data[0] = macro_avg_precision; + metrics_data[1] = macro_avg_recall; + metrics_data[2] = macro_f1_score; + metrics_data[3] = micro_avg_precision; + metrics_data[4] = micro_avg_recall; + metrics_data[5] = micro_f1_score; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..22b970d971221e86a25e0b72f3d3704e5cee5d7f --- /dev/null +++ b/paddle/fluid/operators/prelu_op.cc @@ -0,0 +1,92 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/prelu_op.h" +#include "paddle/fluid/operators/net_op.h" + +namespace paddle { +namespace operators { + +class PReluOp : public framework::OperatorWithKernel { + public: + PReluOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasInput("Alpha"), "Input(Alpha) should not be null"); + PADDLE_ENFORCE(product(ctx->GetInputDim("Alpha")) == 1, + "Size of weight Alpha must be one."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null"); + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class PReluOpMaker : public framework::OpProtoAndCheckerMaker { + public: + PReluOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input tensor of prelu operator."); + AddInput("Alpha", "The alpha weight of prelu operator."); + AddOutput("Out", "The output tensor of prelu operator."); + AddComment(R"DOC( +PRelu Operator. + +The equation is: + +$$ +f(x) = +\begin{cases} +\alpha * x, \quad \text{if} \ x < 0 \\ +x, \qquad \text{if} \ x >= 0 +\end{cases} +$$ + +The input `X` can carry the LoD (Level of Details) information, +or not. And the output shares the LoD information with input `X`. + +)DOC"); + } +}; + +// The operator to calculate gradients of a prelu operator. +class PReluGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->SetOutputDim(framework::GradVarName("Alpha"), + ctx->GetInputDim("Alpha")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP(prelu, ops::PReluOp, ops::PReluOpMaker, prelu_grad, + ops::PReluGradOp); +REGISTER_OP_CPU_KERNEL( + prelu, ops::PReluKernel); +REGISTER_OP_CPU_KERNEL( + prelu_grad, + ops::PReluGradKernel); diff --git a/paddle/fluid/operators/prelu_op.cu b/paddle/fluid/operators/prelu_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..038b09a493c5064d5419260b2fbfdf56b6bb5982 --- /dev/null +++ b/paddle/fluid/operators/prelu_op.cu @@ -0,0 +1,22 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
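A small aside on the PReLU operator defined above: since InferShape requires Alpha to contain exactly one element, the slope is shared across the whole input. The snippet below is an illustrative restatement of the forward formula and of the X-gradient rule (which, like PReluGradFunctor, branches on the sign of the output); it is not the Transform-based kernels themselves, and the function names are editorial.

```python
# Illustrative PReLU with a single shared alpha, mirroring the formula
#   f(x) = alpha * x  if x < 0,   f(x) = x  if x >= 0
# and the gradient-w.r.t.-X rule used by PReluGradFunctor (branching on out > 0).

def prelu_forward(xs, alpha):
    return [x if x > 0 else alpha * x for x in xs]

def prelu_grad_x(outs, douts, alpha):
    # dX = dOut where the output is positive, alpha * dOut elsewhere.
    return [d if o > 0 else alpha * d for o, d in zip(outs, douts)]

xs = [-2.0, -0.5, 0.0, 1.5]
alpha = 0.25
outs = prelu_forward(xs, alpha)
print(outs)                                        # [-0.5, -0.125, 0.0, 1.5]
print(prelu_grad_x(outs, [1.0] * len(xs), alpha))  # [0.25, 0.25, 0.25, 1.0]
```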
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/prelu_op.h" + +REGISTER_OP_CUDA_KERNEL( + prelu, + paddle::operators::PReluKernel); +REGISTER_OP_CUDA_KERNEL(prelu_grad, + paddle::operators::PReluGradKernel< + paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/fluid/operators/prelu_op.h b/paddle/fluid/operators/prelu_op.h new file mode 100644 index 0000000000000000000000000000000000000000..85ad75d479001ec5dad1b796d4932c7e6c4ab7af --- /dev/null +++ b/paddle/fluid/operators/prelu_op.h @@ -0,0 +1,104 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/transform.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using platform::Transform; + +template +class PReluFunctor { + public: + explicit PReluFunctor(const T* alpha) : alpha_(alpha) {} + + HOSTDEVICE T operator()(const T& x) const { + if (x > 0) + return x; + else + return x * (*alpha_); + } + + private: + const T* alpha_; +}; + +template +class PReluKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* alpha = context.Input("Alpha"); + auto* out = context.Output("Out"); + + const T* x_ptr = x->data(); + T* o_ptr = out->mutable_data(context.GetPlace()); + + auto* alpha_ptr = alpha->data(); + + int numel = x->numel(); + + Transform trans; + trans(context.template device_context(), x_ptr, + x_ptr + numel, o_ptr, PReluFunctor(alpha_ptr)); + } +}; + +template +class PReluGradFunctor { + public: + explicit PReluGradFunctor(const T* alpha) : alpha_(alpha) {} + + HOSTDEVICE T operator()(const T& out, const T& dout) const { + if (out > 0) + return dout; + else + return dout * (*alpha_); + } + + private: + const T* alpha_; +}; + +template +class PReluGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* dx = context.Output(framework::GradVarName("X")); + auto* dout = context.Input(framework::GradVarName("Out")); + + auto* out = context.Input("Out"); + auto* alpha = context.Input("Alpha"); + auto* alpha_ptr = alpha->data(); + + T* dx_ptr = dx->mutable_data(context.GetPlace()); + const T* dout_ptr = dout->data(); + const T* out_ptr = out->data(); + int numel = dx->numel(); + + Transform trans; + trans(context.template device_context(), out_ptr, + out_ptr + numel, dout_ptr, dx_ptr, PReluGradFunctor(alpha_ptr)); + + // TODO(Zhuoyuan): add dalpha 
upgrade when GPU kernels ready + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/print_op.cc b/paddle/fluid/operators/print_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3616545309e8c279f61a22e571a5e71335c47f93 --- /dev/null +++ b/paddle/fluid/operators/print_op.cc @@ -0,0 +1,283 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +#include + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/variable.h" + +namespace paddle { +namespace operators { + +#define CLOG std::cout + +const std::string kForward = "FORWARD"; +const std::string kBackward = "BACKWARD"; +const std::string kBoth = "BOTH"; + +struct Formater { + std::string message; + std::string name; + std::vector dims; + std::type_index dtype{typeid(char)}; + framework::LoD lod; + int summarize; + void* data{nullptr}; + + void operator()(size_t size) { + PrintMessage(); + PrintName(); + PrintDims(); + PrintDtype(); + PrintLod(); + PrintData(size); + } + + private: + void PrintMessage() { CLOG << std::time(nullptr) << "\t" << message; } + void PrintName() { + if (!name.empty()) { + CLOG << "Tensor[" << name << "]" << std::endl; + } + } + void PrintDims() { + if (!dims.empty()) { + CLOG << "\tshape: ["; + for (auto i : dims) { + CLOG << i << ","; + } + CLOG << "]" << std::endl; + } + } + void PrintDtype() { + if (dtype.hash_code() != typeid(char).hash_code()) { + CLOG << "\tdtype: " << dtype.name() << std::endl; + } + } + void PrintLod() { + if (!lod.empty()) { + CLOG << "\tLoD: ["; + for (auto level : lod) { + CLOG << "[ "; + for (auto i : level) { + CLOG << i << ","; + } + CLOG << " ]"; + } + CLOG << "]" << std::endl; + } + } + + void PrintData(size_t size) { + PADDLE_ENFORCE_NOT_NULL(data); + // print float + if (dtype.hash_code() == typeid(float).hash_code()) { + Display(size); + } + if (dtype.hash_code() == typeid(double).hash_code()) { + Display(size); + } + if (dtype.hash_code() == typeid(int).hash_code()) { + Display(size); + } + if (dtype.hash_code() == typeid(int64_t).hash_code()) { + Display(size); + } + } + + template + void Display(size_t size) { + auto* d = (T*)data; + CLOG << "\tdata: "; + if (summarize != -1) { + summarize = std::min(size, (size_t)summarize); + for (int i = 0; i < summarize; i++) { + CLOG << d[i] << ","; + } + } else { + for (size_t i = 0; i < size; i++) { + CLOG << d[i] << ","; + } + } + CLOG << std::endl; + } +}; + +// TODO(ChunweiYan) there should be some other printers for TensorArray +class TensorPrintOp : public framework::OperatorBase { + public: + TensorPrintOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + TensorPrintOp(const TensorPrintOp& o) + : framework::OperatorBase( + static_cast(o)) { + PADDLE_THROW("Not implemented."); + } + + void Run(const framework::Scope& 
scope, + const platform::Place& place) const override { + const framework::Variable* in_var_ptr = nullptr; + std::string phase = kForward; + std::string printed_var_name = ""; + + auto& inputs = Inputs(); + if (inputs.find("In") != inputs.end() && !Inputs("In").empty()) { + in_var_ptr = scope.FindVar(Input("In")); + printed_var_name = Inputs("In").front(); + } else if (inputs.find("In@GRAD") != inputs.end() && + !Inputs("In@GRAD").empty()) { + in_var_ptr = scope.FindVar(Input("In@GRAD")); + printed_var_name = Inputs("In@GRAD").front(); + phase = kBackward; + } else { + PADDLE_THROW("Unknown phase, should be forward or backward."); + } + + PADDLE_ENFORCE_NOT_NULL(in_var_ptr); + + auto& in_tensor = in_var_ptr->Get(); + auto* out_var_ptr = scope.FindVar(Output("Out")); + auto& out_tensor = *out_var_ptr->GetMutable(); + + // Just copy data from input tensor to output tensor + // output tensor share same memory with input tensor + out_tensor.ShareDataWith(in_tensor); + out_tensor.set_lod(in_tensor.lod()); + + std::string print_phase = Attr("print_phase"); + if (print_phase != phase && print_phase != kBoth) { + return; + } + + int first_n = Attr("first_n"); + if (first_n > 0 && ++times_ > first_n) return; + + framework::LoDTensor printed_tensor; + printed_tensor.set_lod(in_tensor.lod()); + printed_tensor.Resize(in_tensor.dims()); + + if (platform::is_cpu_place(in_tensor.place())) { + printed_tensor.ShareDataWith(in_tensor); + } else { + // copy data to cpu to print + platform::CPUPlace place; + framework::Copy(in_tensor, place, &printed_tensor); + } + + Formater formater; + if (Attr("print_tensor_name")) { + formater.name = printed_var_name; + } + if (Attr("print_tensor_type")) { + formater.dtype = printed_tensor.type(); + } + if (Attr("print_tensor_shape")) { + auto& dims = printed_tensor.dims(); + formater.dims.resize(dims.size()); + for (int i = 0; i < dims.size(); ++i) formater.dims[i] = dims[i]; + } + if (Attr("print_tensor_lod")) { + formater.lod = printed_tensor.lod(); + } + formater.summarize = Attr("summarize"); + formater.data = (void*)printed_tensor.data(); + formater(printed_tensor.numel()); + } + + private: + mutable int times_{0}; +}; + +class PrintOpProtoAndCheckMaker : public framework::OpProtoAndCheckerMaker { + public: + PrintOpProtoAndCheckMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("In", "Input tensor to be displayed."); + AddAttr("first_n", "Only log `first_n` number of times."); + AddAttr("message", "A string message to print as a prefix."); + AddAttr("summarize", "Number of elements printed."); + AddAttr("print_tensor_name", "Whether to print the tensor name."); + AddAttr("print_tensor_type", "Whether to print the tensor's dtype."); + AddAttr("print_tensor_shape", "Whether to print the tensor's shape."); + AddAttr("print_tensor_lod", "Whether to print the tensor's lod."); + AddAttr( + "print_phase", + "(string, default 'BOTH') Which phase to display including 'FORWARD' " + "'BACKWARD' and 'BOTH'.") + .SetDefault(kBoth) + .InEnum({kForward, kBackward, kBoth}); + AddOutput("Out", "Output tensor with same data as input tensor."); + AddComment(R"DOC( +Creates a print op that will print when a tensor is accessed. 
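The output follows the Formater defined above: a timestamp and optional message, then (depending on the print_tensor_* attributes) the tensor name, shape, dtype and LoD, and at most `summarize` data values. The following is a rough, illustrative imitation of that formatting, not the operator's actual code path:

```python
# Rough imitation of the log line assembled by the Formater above.
# Field toggles correspond to the print_tensor_* attributes; `summarize`
# limits how many data values are shown (-1 means all).
import time

def format_tensor(message, name, shape, dtype, lod, data, summarize=-1,
                  print_name=True, print_type=True, print_shape=True,
                  print_lod=True):
    lines = ["%d\t%s" % (int(time.time()), message)]
    if print_name and name:
        lines.append("Tensor[%s]" % name)
    if print_shape:
        lines.append("\tshape: [%s,]" % ",".join(str(d) for d in shape))
    if print_type:
        lines.append("\tdtype: %s" % dtype)
    if print_lod and lod:
        lines.append("\tLoD: [%s]" % "".join(
            "[ %s, ]" % ",".join(map(str, level)) for level in lod))
    shown = data if summarize == -1 else data[:summarize]
    lines.append("\tdata: " + ",".join(str(v) for v in shown) + ",")
    return "\n".join(lines)

print(format_tensor("after softmax", "fc_0.out", [2, 3], "float",
                    [[0, 1, 2]], [0.1, 0.2, 0.7, 0.3, 0.3, 0.4], summarize=4))
```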
+ +Wraps the tensor passed in so that whenever that a tensor is accessed, +the message `message` is printed, along with the current value of the +tensor `t`.)DOC"); + } +}; + +class InferShapeForward : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* context) const override { + PADDLE_ENFORCE(context->HasInput("In"), "Input(In) should not be null."); + context->ShareLoD("In", /*->*/ "Out"); + context->SetOutputDim("Out", context->GetInputDim("In")); + } +}; + +class InferShapeBackward : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* context) const override { + PADDLE_ENFORCE(context->HasInput("In@GRAD"), + "Input(In@GRAD) should not be null."); + context->ShareLoD("In@GRAD", /*->*/ "Out"); + context->SetOutputDim("Out", context->GetInputDim("In@GRAD")); + } +}; + +class InferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override {} +}; + +class PrintOpProtoAndCheckGradOpMaker + : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + std::unique_ptr Apply() const override { + auto* op_desc_ptr = new framework::OpDesc(); + op_desc_ptr->SetType("print_grad"); + op_desc_ptr->SetInput("In@GRAD", OutputGrad("Out")); + op_desc_ptr->SetOutput("Out", InputGrad("In")); + op_desc_ptr->SetAttrMap(Attrs()); + return std::unique_ptr(op_desc_ptr); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(print, ops::TensorPrintOp, ops::PrintOpProtoAndCheckMaker, + ops::PrintOpProtoAndCheckGradOpMaker, ops::InferShapeForward, + ops::InferVarType); +REGISTER_OPERATOR(print_grad, ops::TensorPrintOp, ops::InferShapeBackward); diff --git a/paddle/fluid/operators/prior_box_op.cc b/paddle/fluid/operators/prior_box_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..ed48603e17f38f89705186fb9fb992f69d26d2ff --- /dev/null +++ b/paddle/fluid/operators/prior_box_op.cc @@ -0,0 +1,165 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/prior_box_op.h" + +namespace paddle { +namespace operators { + +class PriorBoxOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of PriorBoxOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Image"), + "Input(Image) of PriorBoxOp should not be null."); + + auto image_dims = ctx->GetInputDim("Image"); + auto input_dims = ctx->GetInputDim("Input"); + PADDLE_ENFORCE(image_dims.size() == 4, "The layout of image is NCHW."); + PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW."); + + PADDLE_ENFORCE_LT(input_dims[2], image_dims[2], + "The height of input must smaller than image."); + + PADDLE_ENFORCE_LT(input_dims[3], image_dims[3], + "The width of input must smaller than image."); + + auto min_sizes = ctx->Attrs().Get>("min_sizes"); + auto max_sizes = ctx->Attrs().Get>("max_sizes"); + auto variances = ctx->Attrs().Get>("variances"); + auto aspect_ratios = ctx->Attrs().Get>("aspect_ratios"); + bool flip = ctx->Attrs().Get("flip"); + + std::vector aspect_ratios_vec; + ExpandAspectRatios(aspect_ratios, flip, aspect_ratios_vec); + + int num_priors = aspect_ratios_vec.size() * min_sizes.size(); + if (max_sizes.size() > 0) { + PADDLE_ENFORCE_EQ(max_sizes.size(), min_sizes.size(), + "The number of min_size and max_size must be equal."); + for (size_t i = 0; i < min_sizes.size(); ++i) { + PADDLE_ENFORCE_GT(max_sizes[i], min_sizes[i], + "max_size[%d] must be greater than min_size[%d].", i, + i); + num_priors += 1; + } + } + + std::vector dim_vec(4); + dim_vec[0] = input_dims[2]; + dim_vec[1] = input_dims[3]; + dim_vec[2] = num_priors; + dim_vec[3] = 4; + ctx->SetOutputDim("Boxes", framework::make_ddim(dim_vec)); + ctx->SetOutputDim("Variances", framework::make_ddim(dim_vec)); + } +}; + +class PriorBoxOpMaker : public framework::OpProtoAndCheckerMaker { + public: + PriorBoxOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Input", + "(Tensor, default Tensor), " + "the input feature data of PriorBoxOp, The layout is NCHW."); + AddInput("Image", + "(Tensor, default Tensor), " + "the input image data of PriorBoxOp, The layout is NCHW."); + AddOutput("Boxes", + "(Tensor, default Tensor), the output prior boxes of " + "PriorBoxOp. The layout is [H, W, num_priors, 4]. " + "H is the height of input, W is the width of input, num_priors " + "is the box count of each position."); + AddOutput("Variances", + "(Tensor, default Tensor), the expanded variances of " + "PriorBoxOp. The layout is [H, W, num_priors, 4]. 
" + "H is the height of input, W is the width of input, num_priors " + "is the box count of each position."); + + AddAttr>("min_sizes", + "(vector) List of min sizes " + "of generated prior boxes.") + .AddCustomChecker([](const std::vector& min_sizes) { + PADDLE_ENFORCE_GT(min_sizes.size(), 0, + "Size of min_sizes must be at least 1."); + for (size_t i = 0; i < min_sizes.size(); ++i) { + PADDLE_ENFORCE_GT(min_sizes[i], 0, + "min_sizes[%d] must be positive.", i); + } + }); + AddAttr>( + "max_sizes", + "(vector) List of max sizes of generated prior boxes."); + AddAttr>( + "aspect_ratios", + "(vector) List of aspect ratios of generated prior boxes."); + + AddAttr>( + "variances", + "(vector) List of variances to be encoded in prior boxes.") + .AddCustomChecker([](const std::vector& variances) { + PADDLE_ENFORCE_EQ(variances.size(), 4, + "Must and only provide 4 variance."); + for (size_t i = 0; i < variances.size(); ++i) { + PADDLE_ENFORCE_GT(variances[i], 0.0, + "variance[%d] must be greater than 0.", i); + } + }); + AddAttr("flip", "(bool) Whether to flip aspect ratios.") + .SetDefault(true); + AddAttr("clip", "(bool) Whether to clip out-of-boundary boxes.") + .SetDefault(true); + + AddAttr("step_w", + "Prior boxes step across width, 0 for auto calculation.") + .SetDefault(0.0) + .AddCustomChecker([](const float& step_w) { + PADDLE_ENFORCE_GT(step_w, 0.0, "step_w should be larger than 0."); + }); + AddAttr("step_h", + "Prior boxes step across height, 0 for auto calculation.") + .SetDefault(0.0) + .AddCustomChecker([](const float& step_h) { + PADDLE_ENFORCE_GT(step_h, 0.0, "step_h should be larger than 0."); + }); + + AddAttr("offset", + "(float) " + "Prior boxes center offset.") + .SetDefault(0.5); + AddComment(R"DOC( +Prior box operator +Generate prior boxes for SSD(Single Shot MultiBox Detector) algorithm. +Each position of the input produce N prior boxes, N is determined by + the count of min_sizes, max_sizes and aspect_ratios, The size of the + box is in range(min_size, max_size) interval, which is generated in + sequence according to the aspect_ratios. + +Please get more information from the following papers: +https://arxiv.org/abs/1512.02325. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(prior_box, ops::PriorBoxOp, ops::PriorBoxOpMaker); +REGISTER_OP_CPU_KERNEL( + prior_box, ops::PriorBoxOpKernel, + ops::PriorBoxOpKernel); diff --git a/paddle/fluid/operators/prior_box_op.h b/paddle/fluid/operators/prior_box_op.h new file mode 100644 index 0000000000000000000000000000000000000000..fd07041233495660605e9cf9acb33d57eb57bc30 --- /dev/null +++ b/paddle/fluid/operators/prior_box_op.h @@ -0,0 +1,201 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/transform.h" + +namespace paddle { +namespace operators { + +inline void ExpandAspectRatios(const std::vector& input_aspect_ratior, + bool flip, + std::vector& output_aspect_ratior) { + constexpr float epsilon = 1e-6; + output_aspect_ratior.clear(); + output_aspect_ratior.push_back(1.0f); + for (size_t i = 0; i < input_aspect_ratior.size(); ++i) { + float ar = input_aspect_ratior[i]; + bool already_exist = false; + for (size_t j = 0; j < output_aspect_ratior.size(); ++j) { + if (fabs(ar - output_aspect_ratior[j]) < epsilon) { + already_exist = true; + break; + } + } + if (!already_exist) { + output_aspect_ratior.push_back(ar); + if (flip) { + output_aspect_ratior.push_back(1.0f / ar); + } + } + } +} + +template +struct ClipFunctor { + HOSTDEVICE inline T operator()(T in) const { + return std::min(std::max(in, 0.), 1.); + } +}; + +template +class PriorBoxOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* image = ctx.Input("Image"); + auto* boxes = ctx.Output("Boxes"); + auto* vars = ctx.Output("Variances"); + + auto min_sizes = ctx.Attr>("min_sizes"); + auto max_sizes = ctx.Attr>("max_sizes"); + auto input_aspect_ratio = ctx.Attr>("aspect_ratios"); + auto variances = ctx.Attr>("variances"); + auto flip = ctx.Attr("flip"); + auto clip = ctx.Attr("clip"); + + std::vector aspect_ratios; + ExpandAspectRatios(input_aspect_ratio, flip, aspect_ratios); + + T step_w = static_cast(ctx.Attr("step_w")); + T step_h = static_cast(ctx.Attr("step_h")); + T offset = static_cast(ctx.Attr("offset")); + + auto img_width = image->dims()[3]; + auto img_height = image->dims()[2]; + + auto feature_width = input->dims()[3]; + auto feature_height = input->dims()[2]; + + T step_width, step_height; + if (step_w == 0 || step_h == 0) { + step_width = static_cast(img_width) / feature_width; + step_height = static_cast(img_height) / feature_height; + } else { + step_width = step_w; + step_height = step_h; + } + + int num_priors = aspect_ratios.size() * min_sizes.size(); + if (max_sizes.size() > 0) { + num_priors += max_sizes.size(); + } + + boxes->mutable_data(ctx.GetPlace()); + vars->mutable_data(ctx.GetPlace()); + + T inv_img_width = 1.0 / img_width; + T inv_img_height = 1.0 / img_height; + + auto e_boxes = framework::EigenTensor::From(*boxes); + for (int h = 0; h < feature_height; ++h) { + for (int w = 0; w < feature_width; ++w) { + T center_x = (w + offset) * step_width; + T center_y = (h + offset) * step_height; + T box_width, box_height; + int idx = 0; + for (size_t s = 0; s < min_sizes.size(); ++s) { + int min_size = min_sizes[s]; + // first prior: aspect_ratio = 1, size = min_size + box_width = box_height = min_size; + // xmin + e_boxes(h, w, idx, 0) = (center_x - box_width * 0.5) * inv_img_width; + // ymin + e_boxes(h, w, idx, 1) = + (center_y - box_height * 0.5) * inv_img_height; + // xmax + e_boxes(h, w, idx, 2) = (center_x + box_width * 0.5) * inv_img_width; + // ymax + e_boxes(h, w, idx, 3) = + (center_y + box_height * 0.5) * inv_img_height; + + idx++; + if (max_sizes.size() > 0) { + int max_size = max_sizes[s]; + // second prior: aspect_ratio = 1, + // size = sqrt(min_size * max_size) + box_width = box_height = sqrt(min_size * max_size); + // xmin + e_boxes(h, w, idx, 0) = + (center_x - box_width * 0.5) * inv_img_width; + // ymin + 
e_boxes(h, w, idx, 1) = + (center_y - box_height * 0.5) * inv_img_height; + // xmax + e_boxes(h, w, idx, 2) = + (center_x + box_width * 0.5) * inv_img_width; + // ymax + e_boxes(h, w, idx, 3) = + (center_y + box_height * 0.5) * inv_img_height; + idx++; + } + + // rest of priors + for (size_t r = 0; r < aspect_ratios.size(); ++r) { + float ar = aspect_ratios[r]; + if (fabs(ar - 1.) < 1e-6) { + continue; + } + box_width = min_size * sqrt(ar); + box_height = min_size / sqrt(ar); + // xmin + e_boxes(h, w, idx, 0) = + (center_x - box_width * 0.5) * inv_img_width; + // ymin + e_boxes(h, w, idx, 1) = + (center_y - box_height * 0.5) * inv_img_height; + // xmax + e_boxes(h, w, idx, 2) = + (center_x + box_width * 0.5) * inv_img_width; + // ymax + e_boxes(h, w, idx, 3) = + (center_y + box_height * 0.5) * inv_img_height; + idx++; + } + } + } + } + + if (clip) { + platform::Transform trans; + ClipFunctor clip_func; + trans(ctx.template device_context(), + boxes->data(), boxes->data() + boxes->numel(), + boxes->data(), clip_func); + } + + framework::Tensor var_t; + var_t.mutable_data( + framework::make_ddim({1, static_cast(variances.size())}), + ctx.GetPlace()); + auto var_et = framework::EigenTensor::From(var_t); + for (size_t i = 0; i < variances.size(); ++i) { + var_et(0, i) = variances[i]; + } + + int box_num = feature_height * feature_width * num_priors; + auto var_dim = vars->dims(); + vars->Resize({box_num, static_cast(variances.size())}); + + auto e_vars = framework::EigenMatrix::From(*vars); + e_vars = var_et.broadcast(Eigen::DSizes(box_num, 1)); + + vars->Resize(var_dim); + } +}; // namespace operators + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/proximal_adagrad_op.cc b/paddle/fluid/operators/proximal_adagrad_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d9e3894c576a94c094c0f4b72e3b6519c4ec26e1 --- /dev/null +++ b/paddle/fluid/operators/proximal_adagrad_op.cc @@ -0,0 +1,116 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/proximal_adagrad_op.h" + +namespace paddle { +namespace operators { + +class ProximalAdagradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Param"), + "Input(Param) of ProximalAdagradOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Moment"), + "Input(Moment) of ProximalAdagradOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(Grad) of ProximalAdagradOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("LearningRate"), + "Input(LearningRate) of ProximalAdagradOp should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + "Output(ParamOut) of ProximalAdagradOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("MomentOut"), + "Output(MomentOut) of ProximalAdagradOp should not be null."); + + auto param_dim = ctx->GetInputDim("Param"); + PADDLE_ENFORCE_EQ( + param_dim, ctx->GetInputDim("Grad"), + "Param and Grad of ProximalAdagrad Op must have same dimension."); + + PADDLE_ENFORCE_EQ( + param_dim, ctx->GetInputDim("Moment"), + "Param and Moment of ProximalAdagrad Op must have same dimension."); + + auto lr_dim = ctx->GetInputDim("LearningRate"); + PADDLE_ENFORCE_EQ(framework::product(lr_dim), 1, + "Learning Rate should be a scalar."); + + ctx->SetOutputDim("ParamOut", param_dim); + ctx->SetOutputDim("MomentOut", param_dim); + } +}; + +class ProximalAdagradOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ProximalAdagradOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Param", + "(Tensor, default Tensor) " + "Input parameter that has to be updated."); + AddInput("Moment", + "(Tensor, default Tensor) " + "Moment parameter that has to be updated."); + AddInput("Grad", + "(Tensor, default Tensor) " + "Input gradient of the parameter."); + AddInput("LearningRate", + "(Tensor, default Tensor) " + "The learning rate should be a tensor of size 1."); + + AddOutput("ParamOut", "(Tensor) Output updated parameter value."); + AddOutput("MomentOut", "(Tensor) Output updated moment value."); + + AddAttr("l1", + "(float, default 0.0) " + "L1 regularization strength.") + .SetDefault(0.0f); + AddAttr("l2", + "(float, default 0.0) " + "L2 regularization strength.") + .SetDefault(0.0f); + AddComment(R"DOC( +Proximal Adagrad Optimizer. 
+ +Optimizer that implements the proximal adagrad algorithm: + +$$ +moment = moment + grad * grad \\ +prox\_param = param - learning\_rate * grad * (1 / \sqrt{moment}) \\ +param = sign(prox\_param) / (1 + learning\_rate * l2) * + \max(|prox\_param| - learning\_rate * l1 , 0) +$$ + +The paper that proposed Proximal GD: +(http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf) +Here, we use the adagrad learning rate as specified here: +(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(proximal_adagrad, ops::ProximalAdagradOp, + ops::ProximalAdagradOpMaker); +REGISTER_OP_CPU_KERNEL( + proximal_adagrad, + ops::ProximalAdagradOpKernel); diff --git a/paddle/fluid/operators/proximal_adagrad_op.cu b/paddle/fluid/operators/proximal_adagrad_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..54c75b3abb8e84b2aa55d044e79423cf86523d76 --- /dev/null +++ b/paddle/fluid/operators/proximal_adagrad_op.cu @@ -0,0 +1,20 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +You may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed +under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +CONDITIONS OF ANY KIND, either express or implied. See the License for the +specific language governing permissions and limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/proximal_adagrad_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + proximal_adagrad, + ops::ProximalAdagradOpKernel); diff --git a/paddle/fluid/operators/proximal_adagrad_op.h b/paddle/fluid/operators/proximal_adagrad_op.h new file mode 100644 index 0000000000000000000000000000000000000000..70205a8d11f757d08150a81d1369133778ad996c --- /dev/null +++ b/paddle/fluid/operators/proximal_adagrad_op.h @@ -0,0 +1,68 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
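To make the update above concrete, here is a minimal scalar sketch of the same rule in plain C++; the function and variable names are illustrative only and are not part of the operator, whose Eigen kernel in proximal_adagrad_op.h applies the rule element-wise:

#include <algorithm>
#include <cmath>

// Scalar form of the ProximalAdagrad update described in the comment above.
void ProximalAdagradStep(float* param, float* moment, float grad,
                         float lr, float l1, float l2) {
  *moment += grad * grad;
  const float prox = *param - lr * grad / std::sqrt(*moment);
  const float sign = (prox > 0.f) - (prox < 0.f);
  *param = sign * std::max(std::fabs(prox) - lr * l1, 0.0f) / (1.0f + lr * l2);
}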
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; + +template +class ProximalAdagradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* param_out = ctx.Output("ParamOut"); + auto* moment_out = ctx.Output("MomentOut"); + + param_out->mutable_data(ctx.GetPlace()); + moment_out->mutable_data(ctx.GetPlace()); + + auto l1 = static_cast(ctx.Attr("l1")); + auto l2 = static_cast(ctx.Attr("l2")); + + auto grad = ctx.Input("Grad"); + auto p = EigenVector::Flatten(*ctx.Input("Param")); + auto m = EigenVector::Flatten(*ctx.Input("Moment")); + auto g = EigenVector::Flatten(*grad); + auto lr = EigenVector::Flatten(*ctx.Input("LearningRate")); + + auto p_out = EigenVector::Flatten(*param_out); + auto m_out = EigenVector::Flatten(*moment_out); + auto* place = ctx.template device_context().eigen_device(); + + Eigen::DSizes grad_dsize(grad->numel()); + + m_out.device(*place) = m + g * g; + auto prox_param = p - lr.broadcast(grad_dsize) * g / m_out.sqrt(); + if (l1 > static_cast(0)) { + p_out.device(*place) = + prox_param.sign() * + (((prox_param.abs() - (lr * l1).broadcast(grad_dsize)) + .cwiseMax(static_cast(0.0))) / + (static_cast(1.0) + (lr * l2).broadcast(grad_dsize))); + } else { + p_out.device(*place) = + prox_param / (static_cast(1.0) + (lr * l2).broadcast(grad_dsize)); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/proximal_gd_op.cc b/paddle/fluid/operators/proximal_gd_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..de7c6843c8ba7c1cc9229a11707de7a1400deee1 --- /dev/null +++ b/paddle/fluid/operators/proximal_gd_op.cc @@ -0,0 +1,97 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/proximal_gd_op.h" + +namespace paddle { +namespace operators { + +class ProximalGDOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Param"), + "Input(Param) of ProximalGDOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(Grad) of ProximalGDOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("LearningRate"), + "Input(LearningRate) of ProximalGDOp should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + "Output(ParamOut) of ProximalGDOp should not be null."); + + auto param_dim = ctx->GetInputDim("Param"); + PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("Grad"), + "Two input of ProximalGD Op's dimension must be same."); + + auto lr_dim = ctx->GetInputDim("LearningRate"); + PADDLE_ENFORCE_EQ(framework::product(lr_dim), 1, + "Learning Rate should be a scalar."); + + ctx->SetOutputDim("ParamOut", param_dim); + } +}; + +class ProximalGDOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ProximalGDOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Param", + "(Tensor, default Tensor) " + "Input parameter value that has to be updated."); + AddInput("Grad", + "(Tensor, default Tensor) " + "Input gradient of the parameter."); + AddInput("LearningRate", + "(Tensor, default Tensor) " + "The learning rate should be a tensor of size 1."); + + AddOutput("ParamOut", "(Tensor) Output updated parameter value."); + + AddAttr("l1", + "(float, default 0.0) " + "L1 regularization strength.") + .SetDefault(0.0f); + AddAttr("l2", + "(float, default 0.0) " + "L2 regularization strength.") + .SetDefault(0.0f); + AddComment(R"DOC( +ProximalGD Operator. + +Optimizer that implements the proximal gradient descent algorithm: + +$$ +prox\_param = param - learning\_rate * grad \\ +param = sign(prox\_param) / (1 + learning\_rate * l2) * + \max(|prox\_param| - learning\_rate * l1, 0) +$$ + +The paper that proposed Proximal Gradient Descent: +(http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf) + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(proximal_gd, ops::ProximalGDOp, + ops::ProximalGDOpMaker); +REGISTER_OP_CPU_KERNEL( + proximal_gd, + ops::ProximalGDOpKernel); diff --git a/paddle/fluid/operators/proximal_gd_op.cu b/paddle/fluid/operators/proximal_gd_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..97b672e872c99783ef4c0cd085e4b86380a06e10 --- /dev/null +++ b/paddle/fluid/operators/proximal_gd_op.cu @@ -0,0 +1,20 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +You may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed +under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +CONDITIONS OF ANY KIND, either express or implied. See the License for the +specific language governing permissions and limitations under the License. 
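As an illustrative numeric check of the ProximalGD update above (hypothetical values): with param = 1.0, grad = 2.0, learning_rate = 0.1, l1 = 0.5 and l2 = 1.0, prox_param = 1.0 - 0.1 * 2.0 = 0.8 and the updated param = sign(0.8) * max(0.8 - 0.1 * 0.5, 0) / (1 + 0.1 * 1.0) = 0.75 / 1.1 ≈ 0.682; with l1 = l2 = 0 the update reduces to plain SGD and simply returns prox_param = 0.8.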
*/ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/proximal_gd_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + proximal_gd, + ops::ProximalGDOpKernel); diff --git a/paddle/fluid/operators/proximal_gd_op.h b/paddle/fluid/operators/proximal_gd_op.h new file mode 100644 index 0000000000000000000000000000000000000000..8372380f25277e7774b72e144e00ea80e76a71e0 --- /dev/null +++ b/paddle/fluid/operators/proximal_gd_op.h @@ -0,0 +1,64 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; + +template +class ProximalGDOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* param_out = ctx.Output("ParamOut"); + + param_out->mutable_data(ctx.GetPlace()); + + auto grad = ctx.Input("Grad"); + + auto l1 = static_cast(ctx.Attr("l1")); + auto l2 = static_cast(ctx.Attr("l2")); + + auto p = EigenVector::Flatten(*ctx.Input("Param")); + auto g = EigenVector::Flatten(*grad); + auto lr = EigenVector::Flatten(*ctx.Input("LearningRate")); + + auto p_out = EigenVector::Flatten(*param_out); + auto& place = *ctx.template device_context().eigen_device(); + + Eigen::DSizes grad_dsize(grad->numel()); + + auto prox_param = p - lr.broadcast(grad_dsize) * g; + if (l1 > 0) { + p_out.device(place) = + prox_param.sign() * + (((prox_param.abs() - (lr * l1).broadcast(grad_dsize)) + .cwiseMax(T(0.0))) / + (1.0 + (lr * l2).broadcast(grad_dsize))); + } else { + p_out.device(place) = + prox_param / (1.0 + (lr * l2).broadcast(grad_dsize)); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/rank_loss_op.cc b/paddle/fluid/operators/rank_loss_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..222ca73d2acfa8cc3d6fa6a3badce4606be9bcb0 --- /dev/null +++ b/paddle/fluid/operators/rank_loss_op.cc @@ -0,0 +1,129 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/rank_loss_op.h" + +namespace paddle { +namespace operators { + +class RankLossOp : public framework::OperatorWithKernel { + public: + RankLossOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + // input check + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("Left"), "Input(Left) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("Right"), "Input(Right) shouldn't be null."); + + auto label_dims = ctx->GetInputDim("Label"); + auto left_dims = ctx->GetInputDim("Left"); + auto right_dims = ctx->GetInputDim("Right"); + + PADDLE_ENFORCE((label_dims == left_dims) && (left_dims == right_dims), + "All inputs must have the same size."); + PADDLE_ENFORCE( + (label_dims.size() == 2) && (label_dims[1] == 1), + "All inputs must be 2-D tensors with shape [batch_size x 1]."); + ctx->SetOutputDim("Out", label_dims); + } +}; + +class RankLossOpMaker : public framework::OpProtoAndCheckerMaker { + public: + RankLossOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Label", + "(2-D Tensor with shape [batch_size x 1]) " + "The label indicating A ranked higher than B or not."); + AddInput("Left", + "(2-D Tensor with shape [batch_size x 1]) " + "The output of RankNet for doc A."); + AddInput("Right", + "(2-D Tensor with shape [batch_size x 1]) " + "The output of RankNet for doc B."); + AddOutput("Out", + "(2-D Tensor with shape [batch_size x 1]) " + "The output loss of RankLoss operator."); + AddComment(R"DOC( +RankLoss Operator. + +RankLoss operator for RankNet +(http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf). +RankNet is a pairwise ranking model with +one training sample consisting of a pair of doc A and B, and the label P +indicating that A is ranked higher than B or not: + +P = {0, 1} or {0, 0.5, 1}, where 0.5 means no information about the rank of +the input pair. + +The RankLoss operator takes three inputs: Left (o_i), Right (o_j) and Label +(P_{i,j}), which represent the output score of RankNet for the two docs and +the label respectively, and yields the rank loss C_{i,j} using the following +equation: + +$$ + C_{i,j} = -\tilde{P_{ij}} * o_{i,j} + \log(1 + e^{o_{i,j}}) \\ + o_{i,j} = o_i - o_j \\ + \tilde{P_{i,j}} = \left \{0, 0.5, 1 \right \} \ or \ \left \{0, 1 \right \} +$$ + +The operator can take batch inputs with size batch_size (batch_size >= 1). 
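As an illustrative single-pair example (hypothetical scores): with o_i = 2.0, o_j = 1.0 and label P_{i,j} = 1, o_{i,j} = 1.0 and C_{i,j} = -1.0 + log(1 + e^{1.0}) ≈ 0.313, while flipping the label to P_{i,j} = 0 raises the loss to log(1 + e^{1.0}) ≈ 1.313, i.e. the pair is penalized more when the predicted order contradicts the label.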
+ +)DOC"); + } +}; + +class RankLossGradOp : public framework::OperatorWithKernel { + public: + RankLossGradOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("Left"), "Input(Left) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("Right"), "Input(Right) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) shouldn't be null."); + auto dims = ctx->GetInputDim("Left"); + auto left_grad_name = framework::GradVarName("Left"); + auto right_grad_name = framework::GradVarName("Right"); + + if (ctx->HasOutput(left_grad_name)) { + ctx->SetOutputDim(left_grad_name, dims); + } + + if (ctx->HasOutput(right_grad_name)) { + ctx->SetOutputDim(right_grad_name, dims); + } + } +}; + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; + +REGISTER_OP(rank_loss, ops::RankLossOp, ops::RankLossOpMaker, rank_loss_grad, + ops::RankLossGradOp); +REGISTER_OP_CPU_KERNEL( + rank_loss, ops::RankLossKernel); +REGISTER_OP_CPU_KERNEL( + rank_loss_grad, + ops::RankLossGradKernel); diff --git a/paddle/fluid/operators/rank_loss_op.cu b/paddle/fluid/operators/rank_loss_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..1b182ced70d2a234237f1de822c3dc81047ebda7 --- /dev/null +++ b/paddle/fluid/operators/rank_loss_op.cu @@ -0,0 +1,22 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/rank_loss_op.h" + +REGISTER_OP_CUDA_KERNEL(rank_loss, + paddle::operators::RankLossKernel< + paddle::platform::CUDADeviceContext, float>); +REGISTER_OP_CUDA_KERNEL(rank_loss_grad, + paddle::operators::RankLossGradKernel< + paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/fluid/operators/rank_loss_op.h b/paddle/fluid/operators/rank_loss_op.h new file mode 100644 index 0000000000000000000000000000000000000000..08bb2c28218e8c478af426b560efc9a4b6161696 --- /dev/null +++ b/paddle/fluid/operators/rank_loss_op.h @@ -0,0 +1,80 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class RankLossKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* out_t = ctx.Output("Out"); + auto* label_t = ctx.Input("Label"); + auto* left_t = ctx.Input("Left"); + auto* right_t = ctx.Input("Right"); + out_t->mutable_data(ctx.GetPlace()); + + auto out = framework::EigenVector::Flatten(*out_t); + auto label = framework::EigenVector::Flatten(*label_t); + auto left = framework::EigenVector::Flatten(*left_t); + auto right = framework::EigenVector::Flatten(*right_t); + + auto& dev = *ctx.template device_context().eigen_device(); + out.device(dev) = + (1. + (left - right).exp()).log() - label * (left - right); + } +}; + +template +class RankLossGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* d_left_t = + ctx.Output(framework::GradVarName("Left")); + auto* d_right_t = + ctx.Output(framework::GradVarName("Right")); + + auto* d_out_t = ctx.Input(framework::GradVarName("Out")); + auto* label_t = ctx.Input("Label"); + auto* left_t = ctx.Input("Left"); + auto* right_t = ctx.Input("Right"); + + auto& dev = *ctx.template device_context().eigen_device(); + auto d_out = framework::EigenVector::Flatten(*d_out_t); + auto label = framework::EigenVector::Flatten(*label_t); + auto left = framework::EigenVector::Flatten(*left_t); + auto right = framework::EigenVector::Flatten(*right_t); + + // compute d_left + if (d_left_t) { + d_left_t->mutable_data(ctx.GetPlace()); + auto d_left = framework::EigenVector::Flatten(*d_left_t); + d_left.device(dev) = d_out * (1. / (1. + (right - left).exp()) - label); + } + // compute d_right + if (d_right_t) { + d_right_t->mutable_data(ctx.GetPlace()); + auto d_right = framework::EigenVector::Flatten(*d_right_t); + d_right.device(dev) = + -d_out * (1.0 / (1. + (right - left).exp()) - label); + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/read_op.cc b/paddle/fluid/operators/read_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..4d562c291911f54c9d1e8fed2e84035808bffbb7 --- /dev/null +++ b/paddle/fluid/operators/read_op.cc @@ -0,0 +1,99 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
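For reference, a minimal standalone sketch of the scalar math behind RankLossKernel and RankLossGradKernel above, with a finite-difference check of the Left gradient for d_out = 1; the helper names are assumptions for illustration and not part of the operator:

#include <cmath>
#include <cstdio>

// Scalar rank loss: log(1 + exp(left - right)) - label * (left - right).
float RankLoss(float left, float right, float label) {
  return std::log(1.0f + std::exp(left - right)) - label * (left - right);
}

// Matches d_left in RankLossGradKernel (with d_out = 1):
// sigmoid(left - right) - label == 1 / (1 + exp(right - left)) - label.
float RankLossGradLeft(float left, float right, float label) {
  return 1.0f / (1.0f + std::exp(right - left)) - label;
}

int main() {
  const float left = 2.0f, right = 1.0f, label = 1.0f, eps = 1e-3f;
  const float analytic = RankLossGradLeft(left, right, label);
  const float numeric =
      (RankLoss(left + eps, right, label) -
       RankLoss(left - eps, right, label)) / (2.0f * eps);
  std::printf("analytic = %f, numeric = %f\n", analytic, numeric);
  return 0;
}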
+ +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/reader.h" + +namespace paddle { +namespace operators { + +class ReadInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Reader"), + "The ReadOp must take a reader as input."); + PADDLE_ENFORCE(ctx->HasOutputs("Out"), + "The ReadOp should be assigned with output."); + std::vector reader_dims = ctx->GetReaderDims("Reader"); + std::vector out_names = ctx->Outputs("Out"); + PADDLE_ENFORCE_EQ( + reader_dims.size(), out_names.size(), + "The reader's dim number doesn't match the output number."); + ctx->SetOutputsDim("Out", reader_dims); + } +}; + +class ReadInferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + std::string reader_name = op_desc.Input("Reader")[0]; + std::vector out_names = op_desc.Output("Out"); + framework::VarDesc* reader = block->FindVarRecursive(reader_name); + auto dtypes = reader->GetDataTypes(); + PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size()); + for (size_t i = 0; i < dtypes.size(); ++i) { + framework::VarDesc& out = block->FindRecursiveOrCreateVar(out_names[i]); + out.SetType(framework::proto::VarDesc::LOD_TENSOR); + out.SetDataType(dtypes[i]); + } + } +}; + +class ReadOp : public framework::OperatorBase { + public: + using framework::OperatorBase::OperatorBase; + void Run(const framework::Scope& scope, + const platform::Place& dev_place) const override { + framework::ReaderHolder* reader = + scope.FindVar(Input("Reader"))->GetMutable(); + if (!reader->HasNext()) { + reader->ReInit(); + PADDLE_ENFORCE( + reader->HasNext(), + "Reader can not read the next data even it has been re-initialized."); + } + std::vector out_arg_names = Outputs("Out"); + std::vector ins; + reader->ReadNext(&ins); + PADDLE_ENFORCE_EQ(ins.size(), out_arg_names.size()); + for (size_t i = 0; i < ins.size(); ++i) { + auto* out = + scope.FindVar(out_arg_names[i])->GetMutable(); + out->ShareDataWith(ins[i]); + out->set_lod(ins[i].lod()); + } + } +}; + +class ReadOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ReadOpMaker(OpProto* op_proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(op_proto, op_checker) { + AddInput("Reader", "(ReaderHolder) The executed reader."); + AddOutput("Out", "(LoDTensor) The output data.").AsDuplicable(); + AddComment(R"DOC( + Read Operator + + Execute a given reader once and output data. + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(read, ops::ReadOp, ops::ReadInferShape, ops::ReadOpMaker, + paddle::framework::EmptyGradOpMaker, ops::ReadInferVarType); diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e4b9b8dab9b0394752d538aa5f59be3c06d0188f --- /dev/null +++ b/paddle/fluid/operators/recurrent_op.cc @@ -0,0 +1,635 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { +constexpr char kInputs[] = "inputs"; +constexpr char kInitialStates[] = "initial_states"; +constexpr char kParameters[] = "parameters"; +constexpr char kOutputs[] = "outputs"; +constexpr char kStepScopes[] = "step_scopes"; +constexpr char kExStates[] = "ex_states"; +constexpr char kStates[] = "states"; +constexpr char kStepBlock[] = "sub_block"; +constexpr char kReverse[] = "reverse"; +constexpr char kIsTrain[] = "is_train"; +#define GRAD_SUFFIX "@GRAD" +constexpr char kInputGrads[] = "inputs" GRAD_SUFFIX; +constexpr char kOutputGrads[] = "outputs" GRAD_SUFFIX; +constexpr char kParamGrads[] = "parameters" GRAD_SUFFIX; +constexpr char kInitStateGrads[] = "initial_states" GRAD_SUFFIX; + +using StepScopeVar = std::vector; + +// StepScopes manages scopes inside RNN. +// StepScopes::CurScope() get the current scope +// StepScopes::ExScope() get the ex-scope, or scope in previous time step. +// StepScopes::Next() move to next time step. +// +// if is_train = False, then +// there are two scopes for the RNN and just support forward. +// else +// the len(scopes) == seq_len +// +// if is_backward = True, then +// reversely access scopes +// else +// access scopes from begin to end. +class StepScopes { + public: + StepScopes(const framework::Scope &parent, StepScopeVar *scopes, + bool is_train, size_t seq_len, bool is_backward = false) + : counter_(is_backward ? seq_len - 1 : 0UL), + scopes_(scopes), + is_train_(is_train), + is_backward_(is_backward) { + size_t num_step_scopes = is_train ? seq_len : 2; + PADDLE_ENFORCE(is_train || !is_backward, + "Cannot backward when is not training"); + if (!is_backward_) { + PADDLE_ENFORCE(scopes->empty()); + scopes->reserve(static_cast(num_step_scopes)); + for (size_t i = 0; i < num_step_scopes; ++i) { + scopes->emplace_back(&parent.NewScope()); + } + } + } + + framework::Scope &CurScope() { return GetScope(counter_); } + + framework::Scope &ExScope() { + auto &scope = GetScope(is_backward_ ? counter_ + 1 : counter_ - 1); + return scope; + } + + void Next() { + if (is_backward_) { + --counter_; + } else { + ++counter_; + } + } + + private: + framework::Scope &GetScope(size_t scope_id) const { + if (!is_train_) { + scope_id %= 2; + } + PADDLE_ENFORCE_LT(scope_id, scopes_->size()); + return *(*scopes_)[scope_id]; + } + + size_t counter_; + StepScopeVar *scopes_; + bool is_train_; + bool is_backward_; +}; + +// Base class for RecurrentOp/RecurrentGradOp +// Some common protected functions for RecurrentOp/RecurrentGradOp +class RecurrentBase : public framework::OperatorBase { + public: + RecurrentBase(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + protected: + // Get SequenceLength from Scope + // The sequence length is got from input tensor. The input tensor's + // dimension should be [SEQ_LEN, ..., ...]. The first of the tensor's shape + // is SEQ_LEN. 
The second of the tensor's shape could be the batch size or + // nested sequence length. + int64_t GetSequenceLength(const framework::Scope &scope) const { + // Dim format SEQ_LEN, BATCH_SIZE, ... + int64_t seq_len = -1; + auto &all_inputs = Inputs(kInputs); + PADDLE_ENFORCE(!all_inputs.empty()); + for (auto &iname : all_inputs) { + auto *var = scope.FindVar(iname); + PADDLE_ENFORCE(var != nullptr); + PADDLE_ENFORCE(var->IsType()); + auto &dim = var->Get().dims(); + if (seq_len == -1) { + seq_len = dim[0]; + } else { + PADDLE_ENFORCE_EQ(seq_len, dim[0]); + } + } + return seq_len; + } + + // for src_tensor, dst_tensor in zip(map(src_scope.FindVar, src_vars), + // map(dst_scope.Var, dst_vars)): + // dst_tensor.ShareDataWith(src_tensor) + static void LinkTensor(const framework::Scope &src_scope, + const std::vector &src_vars, + framework::Scope *dst_scope, + const std::vector &dst_vars) { + LinkTensorWithCallback( + src_scope, src_vars, dst_scope, dst_vars, + [&](const framework::Tensor &src, framework::Tensor *dst) { + dst->ShareDataWith(src); + }); + } + + // for src_tensor, dst_tensor in zip(map(src_scope.FindVar, src_vars), + // map(dst_scope.Var, dst_vars)): + // callback(src_tensor, &dst_tensor) + template + static void LinkTensorWithCallback(const framework::Scope &src_scope, + const std::vector &src_vars, + framework::Scope *dst_scope, + const std::vector &dst_vars, + Callback callback) { + PADDLE_ENFORCE_EQ(src_vars.size(), dst_vars.size()); + for (size_t i = 0; i < dst_vars.size(); ++i) { + VLOG(10) << "Link " << src_vars[i] << " to " << dst_vars[i]; + AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback); + } + } + + // for src_tensor, dst_tensor in zip(map(src_scope.FindVar, src_vars), + // map(dst_scope.FindVar, dst_vars)): + // callback(src_tensor, &dst_tensor) + template + static void LinkTensorWithCallback(const framework::Scope &src_scope, + const std::vector &src_vars, + const framework::Scope &dst_scope, + const std::vector &dst_vars, + Callback callback) { + PADDLE_ENFORCE_EQ(src_vars.size(), dst_vars.size()); + for (size_t i = 0; i < dst_vars.size(); ++i) { + VLOG(10) << "Link " << src_vars[i] << " to " << dst_vars[i]; + AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback); + } + } + + // (seq_len, shape) -> return [seq_len] + list(shape) + static framework::DDim PrependDims(size_t seq_len, + const framework::DDim &src) { + auto dims = framework::vectorize(src); + dims.insert(dims.begin(), static_cast(seq_len)); + return framework::make_ddim(dims); + } + + private: + template + static void AccessTensor(const framework::Scope &src_scope, + const std::string &src_var_name, + framework::Scope *dst_scope, + const std::string &dst_var_name, Callback callback) { + auto *src_var = src_scope.FindVar(src_var_name); + PADDLE_ENFORCE(src_var != nullptr); + auto &src_tensor = src_var->Get(); + + auto *dst_var = dst_scope->Var(dst_var_name); + auto *dst_tensor = dst_var->GetMutable(); + callback(src_tensor, dst_tensor); + } + + template + static void AccessTensor(const framework::Scope &src_scope, + const std::string &src_var_name, + const framework::Scope &dst_scope, + const std::string &dst_var_name, Callback callback) { + auto *src_var = src_scope.FindVar(src_var_name); + PADDLE_ENFORCE(src_var != nullptr); + auto &src_tensor = src_var->Get(); + auto *dst_var = dst_scope.FindVar(dst_var_name); + PADDLE_ENFORCE(dst_var != nullptr); + auto *dst_tensor = dst_var->GetMutable(); + callback(src_tensor, dst_tensor); + } +}; + +class RecurrentOp : public 
RecurrentBase { + public: + RecurrentOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : RecurrentBase(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto seq_len = static_cast(this->GetSequenceLength(scope)); + VLOG(3) << "Static RNN input sequence length = " << seq_len; + StepScopes scopes = CreateStepScopes(scope, seq_len); + auto reverse = Attr(kReverse); + + framework::Executor executor(place); + auto *block = Attr(kStepBlock); + + auto *program = block->Program(); + + for (size_t i = 0; i < seq_len; ++i) { + size_t seq_offset = reverse ? seq_len - i - 1 : i; + VLOG(3) << "Recurrent operate at the time step " << seq_offset; + + auto &cur_scope = scopes.CurScope(); + + // Link outside::input --> inside::input + // inside::input = outside::input[seq_offset: seq_offset+1] + LinkTensorWithCallback( + scope, Inputs(kInputs), &cur_scope, Inputs(kInputs), + [&seq_offset](const framework::Tensor &outside, + framework::Tensor *inside) { + inside->ShareDataWith(outside.Slice(seq_offset, seq_offset + 1)); + auto dims = framework::vectorize(inside->dims()); + dims.erase(dims.begin()); + inside->Resize(framework::make_ddim(dims)); + }); + + if (i == 0) { + // Link initial states --> ex_states + LinkTensor(scope, Inputs(kInitialStates), &cur_scope, + Attr>(kExStates)); + } else { + auto &ex_scope = scopes.ExScope(); + // Link ex_scope::state --> cur_scope::ex_state + LinkTensor(ex_scope, Attr>(kStates), + &cur_scope, Attr>(kExStates)); + } + + // Every inputs are linked now, execute! + executor.Run(*program, &cur_scope, block->ID(), + false /*create_local_scope*/); + + // get device context from pool + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + // Copy inside::output -> outside::output + // outside::output[seq_offset: seq_offset + 1] = inside::output + this->LinkTensorWithCallback( + cur_scope, Outputs(kOutputs), scope, Outputs(kOutputs), + [&](const framework::LoDTensor &src_tensor, + framework::LoDTensor *dst_tensor) { + if (i == 0) { // create output tensor at begin + dst_tensor->Resize(PrependDims(seq_len, src_tensor.dims())); + dst_tensor->mutable_data(place, src_tensor.type()); + } + + auto dst_out = dst_tensor->Slice(seq_offset, seq_offset + 1); + // Explicit copy output since the local RNN scope can be destroyed + // early. 
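// For example (illustrative): with seq_len = 4 and reverse = false, step i
// writes rows [i, i + 1) of each outside output tensor, so after the last
// step the outside tensor holds the whole [seq_len, ...] sequence.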
+ framework::Copy(src_tensor, place, dev_ctx, &dst_out); + }); + + scopes.Next(); + } + } + + private: + StepScopes CreateStepScopes(const framework::Scope &scope, + size_t seq_len) const { + auto *var = scope.FindVar(Output(kStepScopes)); + PADDLE_ENFORCE(var != nullptr); + return StepScopes(scope, var->GetMutable(), + Attr(kIsTrain), seq_len); + } +}; + +class RecurrentGradOp : public RecurrentBase { + public: + RecurrentGradOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : RecurrentBase(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto seq_len = static_cast(GetSequenceLength(scope)); + StepScopes scopes = CreateStepScopes(scope, seq_len); + auto reverse = Attr(kReverse); + + framework::Executor executor(place); + auto *block = Attr(kStepBlock); + + auto *program = block->Program(); + + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + for (size_t step_id = 0; step_id < seq_len; ++step_id) { + size_t seq_offset = reverse ? step_id : seq_len - step_id - 1; + VLOG(3) << "Recurrent backward operate at the time step " << seq_offset; + auto &cur_scope = scopes.CurScope(); + // Link outside::output_grads --> inside::output_grads + // inside::output_grad = outside::output_grad[seq_offset:seq_offset+1] + LinkTensorWithCallback( + scope, Inputs(kOutputGrads), &cur_scope, Inputs(kOutputGrads), + [&](const framework::Tensor &outside, framework::Tensor *inside) { + inside->ShareDataWith(outside.Slice(seq_offset, seq_offset + 1)); + auto dims = framework::vectorize(inside->dims()); + dims.erase(dims.begin()); + inside->Resize(framework::make_ddim(dims)); + }); + auto og_set = List2Set(Inputs(kOutputGrads)); + + if (VLOG_IS_ON(10)) { + std::ostringstream sout; + std::copy(og_set.begin(), og_set.end(), + std::ostream_iterator(sout, ",")); + VLOG(10) << " RNN output gradients = [" << sout.str() << "]"; + } + + // Link states + // if cur_scope::cur_state_grad in out_grads: + // cur_scope::cur_state_grad += ex_scope::ex_state_grad + // else: + // ex_scope::ex_state_grad --> cur_scope::cur_state_grad + if (step_id != 0) { // not at beginning + auto &ex_scope = scopes.ExScope(); + auto ex_state_grads = + GradVarLists(Attr>(kExStates)); + auto cur_state_grads = + GradVarLists(Attr>(kStates)); + + PADDLE_ENFORCE_EQ(ex_state_grads.size(), cur_state_grads.size()); + for (size_t i = 0; i < ex_state_grads.size(); ++i) { + auto &cur_grad = cur_state_grads[i]; + auto &ex_grad = ex_state_grads[i]; + auto &ex_tensor = + ex_scope.FindVar(ex_grad)->Get(); + + VLOG(10) << " RNN link " << cur_grad << " from " << ex_grad; + auto *cur_grad_var = cur_scope.Var(cur_grad); + auto cur_grad_tensor = + cur_grad_var->GetMutable(); + framework::Copy(ex_tensor, place, dev_ctx, cur_grad_tensor); + } + } + + VLOG(5) << "Recurrent memory linking finished "; + // Run step block with cur_scope + executor.Run(*program, &cur_scope, block->ID(), + false /*create_local_scope*/); + + VLOG(5) << "executor.Run finished "; + + auto local_var_names = LocalVarNames(cur_scope); + + // Accumulate params + // if (step == 0): + // outside::param_grad = 0.0 + // outside::param_grad += inside::param_grad + { + auto &pg_names = Outputs(kParamGrads); + auto &p_names = Inputs(kParameters); + PADDLE_ENFORCE_EQ(pg_names.size(), p_names.size()); + + for (size_t 
param_id = 0; param_id < pg_names.size(); ++param_id) { + auto inside_grad_name = framework::GradVarName(p_names[param_id]); + + // If does not compute gradient of that variable inside rnn, just + // continue + if (local_var_names.find(inside_grad_name) == local_var_names.end()) { + continue; + } + + // zero gradient variable in step 0 + if (step_id == 0) { + auto &inside_tensor = cur_scope.FindVar(inside_grad_name) + ->Get(); + framework::AttributeMap attrs; + attrs["dtype"] = framework::ToDataType(inside_tensor.type()); + attrs["shape"] = framework::vectorize2int(inside_tensor.dims()); + attrs["value"] = 0.0f; + + auto zero_op = framework::OpRegistry::CreateOp( + "fill_constant", framework::VariableNameMap{}, + {{"Out", {pg_names[param_id]}}}, attrs); + zero_op->Run(scope, place); + } + + auto new_inside_name = cur_scope.Rename(inside_grad_name); + // sum gradient + + auto sum_op = framework::OpRegistry::CreateOp( + "sum", {{"X", {pg_names[param_id], new_inside_name}}}, + {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{}); + sum_op->Run(cur_scope, place); + + cur_scope.Rename(new_inside_name, inside_grad_name); + } + } + VLOG(5) << "Accumulate Parameter finished "; + + // Copy input gradient from inside to outside + // outside::input_grad[seq_offset: seq_offset + 1] = inside::input_grad + LinkTensorWithCallback( + cur_scope, GradVarLists(Inputs(kInputs)), scope, Outputs(kInputGrads), + [&](const framework::LoDTensor &inside, + framework::LoDTensor *outside) { + if (inside.memory_size() == 0) { // IG is not created. + return; + } + if (step_id == 0) { // alloc memory + outside->Resize(PrependDims(seq_len, inside.dims())); + outside->mutable_data(place, inside.type()); + } + + auto dst = outside->Slice(seq_offset, seq_offset + 1); + framework::Copy(inside, place, dev_ctx, &dst); + }); + VLOG(5) << "Link outside gradient finished "; + + if (step_id + 1 == seq_len) { // at_end + // copy initialize states gradient from inside to outside + LinkTensorWithCallback( + cur_scope, GradVarLists(Attr>(kExStates)), + scope, Outputs(kInitStateGrads), + [&](const framework::LoDTensor &inside, + framework::LoDTensor *outside) { + outside->Resize(inside.dims()); + outside->mutable_data(place, inside.type()); + framework::Copy(inside, place, dev_ctx, outside); + }); + VLOG(5) << "Link initialize state gradient finished "; + } + scopes.Next(); + } + } + + private: + StepScopes CreateStepScopes(const framework::Scope &scope, + size_t seq_len) const { + auto *var = scope.FindVar(Input(kStepScopes)); + PADDLE_ENFORCE(var != nullptr); + return StepScopes(scope, var->GetMutable(), + Attr(kIsTrain), seq_len, true /*is_backward*/); + } + + std::unordered_set List2Set( + const std::vector &list) const { + std::unordered_set local_var_name_set; + local_var_name_set.reserve(list.size()); + for (auto &each : list) { + local_var_name_set.insert(each); + } + return local_var_name_set; + } + + std::unordered_set LocalVarNames( + const framework::Scope &scope) const { + return this->List2Set(scope.LocalVarNames()); + } + static std::vector GradVarLists( + const std::vector &var_names) { + std::vector retv; + retv.reserve(var_names.size()); + std::transform(var_names.begin(), var_names.end(), std::back_inserter(retv), + framework::GradVarName); + return retv; + } +}; + +class RecurrentOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + RecurrentOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput(kInputs, "rnn inputs").AsDuplicable(); 
+ AddInput(kInitialStates, "rnn initial states").AsDuplicable(); + AddInput(kParameters, + "Parameters are used by step block as its input. However, the " + "input is not a sequence tensor. Every time step, each operator " + "in step block just use the parameter directly.") + .AsDuplicable(); + AddOutput(kOutputs, + "The output sequence of RNN. The sequence length must be same.") + .AsDuplicable(); + AddOutput(kStepScopes, + "StepScopes contain all local variables in each time step."); + AddAttr>(kExStates, + string::Sprintf( + R"DOC(The ex-state variable names. +The ex-state means the state value in the ex-timestep or the previous time step +[%s, %s, %s] must be the same order)DOC", + kExStates, kStates, kInitStateGrads)); + AddAttr>( + kStates, + string::Sprintf( + "The state variable names. [%s, %s, %s] must be the same order", + kExStates, kStates, kInitStateGrads)); + AddAttr(kStepBlock, "The step block inside RNN"); + AddAttr(kReverse, R"DOC(Calculate RNN reversely or not. +By default reverse=False + +Assume the input data is [A, B, C, D] + +if reverse is False: + the computation of RNN is like + A B C D + | | | | + v v v v + rnn -----> rnn -----> rnn ----> rnn + | | | | + v v v v + o o o o + +if reverse is True + the computation of RNN is like + A B C D + | | | | + v v v v + rnn <----- rnn <----- rnn <---- rnn + | | | | + v v v v + o o o o +)DOC").SetDefault(false); + AddAttr(kIsTrain, "").SetDefault(true); + AddComment(R"DOC( +Static Length Recurrent Operator. + +The static length recurrent operator can only operate on fixed size sequence +data, i.e. in each mini-batch, the sequence length of all inputs are the same. + +)DOC"); + } +}; + +class RecurrentGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + virtual std::unique_ptr Apply() const { + auto *grad = new framework::OpDesc(); + grad->SetType("recurrent_grad"); + for (auto &input_param : this->InputNames()) { + grad->SetInput(input_param, this->Input(input_param)); + grad->SetOutput(framework::GradVarName(input_param), + this->InputGrad(input_param, false)); + } + + for (auto &output_param : this->OutputNames()) { + if (output_param == kStepScopes) { + grad->SetInput(output_param, this->Output(output_param)); + grad->SetInput(framework::GradVarName(output_param), + this->Output(output_param)); + } else { + grad->SetInput(output_param, this->Output(output_param)); + grad->SetInput(framework::GradVarName(output_param), + this->OutputGrad(output_param)); + } + } + grad->SetAttrMap(this->Attrs()); + grad->SetBlockAttr(kStepBlock, *grad_block_[0]); + + return std::unique_ptr(grad); + } +}; + +class RecurrentGradOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + std::vector input{kInputs, kInitialStates}; + std::vector output{kOutputs}; + for (auto &s : input) { + PADDLE_ENFORCE(ctx->HasInputs(s)); + PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(s)), + "Cannot find the gradient variable %s", + framework::GradVarName(s)); + } + for (auto &s : output) { + PADDLE_ENFORCE(ctx->HasInputs(s)); + } + for (auto &s : input) { + ctx->SetOutputsDim(framework::GradVarName(s), ctx->GetInputsDim(s)); + } + if (ctx->HasInputs(kParameters)) { + PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kParameters))); + ctx->SetOutputsDim(framework::GradVarName(kParameters), + ctx->GetInputsDim(kParameters)); + } + } +}; + +} // namespace operators +} // 
namespace paddle + +REGISTER_OPERATOR(recurrent, paddle::operators::RecurrentOp, + paddle::operators::RecurrentOpProtoMaker, + paddle::operators::RecurrentGradOpDescMaker); +REGISTER_OPERATOR(recurrent_grad, paddle::operators::RecurrentGradOp, + paddle::operators::RecurrentGradOpShapeInference); diff --git a/paddle/fluid/operators/recv_op.cc b/paddle/fluid/operators/recv_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c093f60ceed4171ee4ab7f0e5757af2ee5950270 --- /dev/null +++ b/paddle/fluid/operators/recv_op.cc @@ -0,0 +1,77 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +#include +#include "paddle/fluid/operators/detail/grpc_client.h" + +namespace paddle { +namespace operators { + +class RecvOp : public framework::OperatorBase { + public: + RecvOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope& scope, + const platform::Place& place) const override { + auto outs = Outputs("Out"); + std::vector epmap = Attr>("epmap"); + + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(place); + + for (size_t i = 0; i < outs.size(); i++) { + VLOG(3) << "getting " << outs[i]; + client_.AsyncGetVariable(epmap[i], ctx, scope, outs[i]); + } + PADDLE_ENFORCE(client_.Wait()); + } + + private: + mutable detail::RPCClient client_; +}; + +class RecvOpMaker : public framework::OpProtoAndCheckerMaker { + public: + RecvOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddOutput("Out", "(Tensor) Variables to get from server.").AsDuplicable(); + AddComment(R"DOC( +Recv operator + +This operator can get variables from server side. +)DOC"); + AddAttr>("epmap", + "(string vector, default 127.0.0.1:6164)" + "Server endpoints in the order of input " + "variables for mapping") + .SetDefault({}); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(recv, ops::RecvOp, ops::RecvOpMaker); diff --git a/paddle/fluid/operators/reduce_op.cc b/paddle/fluid/operators/reduce_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f4d9d4cc07b1f76ed04e17bc1cc65293163fb6f2 --- /dev/null +++ b/paddle/fluid/operators/reduce_op.cc @@ -0,0 +1,214 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/reduce_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class ReduceOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ReduceOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ReduceOp should not be null."); + auto x_dims = ctx->GetInputDim("X"); + auto x_rank = x_dims.size(); + PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported."); + int dim = ctx->Attrs().Get("dim"); + if (dim < 0) dim = x_rank + dim; + PADDLE_ENFORCE_LT( + dim, x_rank, + "The dim should be in the range [-rank(input), rank(input))."); + bool reduce_all = ctx->Attrs().Get("reduce_all"); + bool keep_dim = ctx->Attrs().Get("keep_dim"); + if (reduce_all) { + if (keep_dim) + ctx->SetOutputDim( + "Out", framework::make_ddim(std::vector(x_rank, 1))); + else + ctx->SetOutputDim("Out", {1}); + } else { + auto dims_vector = vectorize(x_dims); + if (keep_dim || x_rank == 1) { + dims_vector[dim] = 1; + } else { + dims_vector.erase(dims_vector.begin() + dim); + } + auto out_dims = framework::make_ddim(dims_vector); + ctx->SetOutputDim("Out", out_dims); + if (dim != 0) { + // Only pass LoD when not reducing on the first dim. + ctx->ShareLoD("X", /*->*/ "Out"); + } + } + } +}; + +class ReduceGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null."); + auto x_dims = ctx->GetInputDim("X"); + auto x_rank = x_dims.size(); + PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported."); + int dim = ctx->Attrs().Get("dim"); + if (dim < 0) dim = x_rank + dim; + PADDLE_ENFORCE_LT( + dim, x_rank, + "The dim should be in the range [-rank(input), rank(input))."); + auto x_grad_name = framework::GradVarName("X"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + ctx->ShareLoD("X", /*->*/ x_grad_name); + } + } +}; + +class ReduceOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ReduceOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(Tensor) The input tensor. Tensors with rank at most 6 are " + "supported."); + AddOutput("Out", "(Tensor) The result tensor."); + AddAttr( + "dim", + "(int, default 0) The dimension to reduce. " + "Must be in the range [-rank(input), rank(input)). " + "If `dim < 0`, the dim to reduce is `rank + dim`. 
" + "Note that reducing on the first dim will make the LoD info lost.") + .SetDefault(0); + AddAttr("keep_dim", + "(bool, default false) " + "If true, retain the reduced dimension with length 1.") + .SetDefault(false); + AddAttr("reduce_all", + "(bool, default false) " + "If true, output a scalar reduced along all dimensions.") + .SetDefault(false); + comment_ = R"DOC( +{ReduceOp} Operator. + +This operator computes the {reduce} of input tensor along the given dimension. +The result tensor has 1 fewer dimension than the input unless keep_dim is true. +If reduce_all is true, just reduce along all dimensions and output a scalar. + +)DOC"; + AddComment(comment_); + } + + protected: + std::string comment_; + + void Replace(std::string &src, std::string from, std::string to) { + std::size_t len_from = std::strlen(from.c_str()); + std::size_t len_to = std::strlen(to.c_str()); + for (std::size_t pos = src.find(from); pos != std::string::npos; + pos = src.find(from, pos + len_to)) { + src.replace(pos, len_from, to); + } + } + + void SetComment(std::string name, std::string op) { + Replace(comment_, "{ReduceOp}", name); + Replace(comment_, "{reduce}", op); + } +}; + +class ReduceSumOpMaker : public ReduceOpMaker { + public: + ReduceSumOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : ReduceOpMaker(proto, op_checker) { + SetComment("ReduceSum", "sum"); + AddComment(comment_); + } +}; + +class ReduceMeanOpMaker : public ReduceOpMaker { + public: + ReduceMeanOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : ReduceOpMaker(proto, op_checker) { + SetComment("ReduceMean", "mean"); + AddComment(comment_); + } +}; + +class ReduceMaxOpMaker : public ReduceOpMaker { + public: + ReduceMaxOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : ReduceOpMaker(proto, op_checker) { + SetComment("ReduceMax", "max"); + AddComment(comment_); + } +}; + +class ReduceMinOpMaker : public ReduceOpMaker { + public: + ReduceMinOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : ReduceOpMaker(proto, op_checker) { + SetComment("ReduceMin", "min"); + AddComment(comment_); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP(reduce_sum, ops::ReduceOp, ops::ReduceSumOpMaker, reduce_sum_grad, + ops::ReduceGradOp); + +REGISTER_OP(reduce_mean, ops::ReduceOp, ops::ReduceMeanOpMaker, + reduce_mean_grad, ops::ReduceGradOp); + +REGISTER_OP(reduce_max, ops::ReduceOp, ops::ReduceMaxOpMaker, reduce_max_grad, + ops::ReduceGradOp); + +REGISTER_OP(reduce_min, ops::ReduceOp, ops::ReduceMinOpMaker, reduce_min_grad, + ops::ReduceGradOp); + +#define REGISTER_REDUCE_CPU_KERNEL(reduce_type, functor, grad_functor) \ + REGISTER_OP_CPU_KERNEL(reduce_type, \ + ops::ReduceKernel, \ + ops::ReduceKernel, \ + ops::ReduceKernel, \ + ops::ReduceKernel); \ + REGISTER_OP_CPU_KERNEL( \ + reduce_type##_grad, \ + ops::ReduceGradKernel, \ + ops::ReduceGradKernel, \ + ops::ReduceGradKernel, \ + ops::ReduceGradKernel); + +FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_CPU_KERNEL); diff --git a/paddle/fluid/operators/reduce_op.cu b/paddle/fluid/operators/reduce_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..1ca107ebfe9b617bd5e952965543549a8d92a5b1 --- /dev/null +++ b/paddle/fluid/operators/reduce_op.cu @@ -0,0 +1,41 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/reduce_op.h" + +namespace ops = paddle::operators; + +#define REGISTER_REDUCE_GPU_KERNEL(reduce_type, functor, grad_functor) \ + REGISTER_OP_CUDA_KERNEL( \ + reduce_type, ops::ReduceKernel, \ + ops::ReduceKernel, \ + ops::ReduceKernel, \ + ops::ReduceKernel); \ + REGISTER_OP_CUDA_KERNEL( \ + reduce_type##_grad, \ + ops::ReduceGradKernel, \ + ops::ReduceGradKernel, \ + ops::ReduceGradKernel, \ + ops::ReduceGradKernel); + +FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_GPU_KERNEL); diff --git a/paddle/fluid/operators/reduce_op.h b/paddle/fluid/operators/reduce_op.h new file mode 100644 index 0000000000000000000000000000000000000000..a153cf272b5dd8abcba1bc7d3d02c480702eae4d --- /dev/null +++ b/paddle/fluid/operators/reduce_op.h @@ -0,0 +1,257 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
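As an illustrative example of the shape rules in ReduceOp::InferShape above (hypothetical shapes): for an input of shape [3, 4, 5], dim = 1 gives an output of shape [3, 5], or [3, 1, 5] when keep_dim is true; dim = -2 is first normalized to 1 and behaves the same; and reduce_all = true yields a scalar-shaped output of [1] (or [1, 1, 1] with keep_dim). LoD is forwarded to the output only when dim != 0, matching the note in the dim attribute.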
*/ + +#pragma once + +#include "glog/logging.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using DDim = framework::DDim; +template +using EigenTensor = framework::EigenTensor; +template +using EigenScalar = framework::EigenScalar; +template +using EigenVector = framework::EigenVector; + +struct SumFunctor { + template + void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) { + y.device(place) = x.sum(dim); + } +}; + +struct SumGradFunctor { + template + void operator()(const DeviceContext& place, X& x, Y& y, DX& dx, DY& dy, + const Dim& dim, int size) { + dx.device(place) = dy.broadcast(dim); + } +}; + +struct MeanFunctor { + template + void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) { + y.device(place) = x.mean(dim); + } +}; + +struct MeanGradFunctor { + template + void operator()(const DeviceContext& place, X& x, Y& y, DX& dx, DY& dy, + const Dim& dim, int size) { + dx.device(place) = dy.broadcast(dim) / dx.constant(size); + } +}; + +struct MaxFunctor { + template + void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) { + y.device(place) = x.maximum(dim); + } +}; + +struct MinFunctor { + template + void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) { + y.device(place) = x.minimum(dim); + } +}; + +struct MaxOrMinGradFunctor { + template + void operator()(const DeviceContext& place, X& x, Y& y, DX& dx, DY& dy, + const Dim& dim, int size) { + auto equals = x == y.broadcast(dim); + auto ones = dx.constant(1); + auto zeros = dx.constant(0); + // If there are multiple minimum or maximum elements, the subgradient of + // each is the set [0, 1], and we pass gradient to all of them here. 
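// For example (illustrative): x = [3, 1, 3] reduced with max gives y = 3;
// with dy = 1, equals = [1, 0, 1] and dx = [1, 0, 1], so every tied maximum
// receives the full incoming gradient.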
+ dx.device(place) = dy.broadcast(dim) * equals.select(ones, zeros); + } +}; + +template +class ReduceKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool reduce_all = context.Attr("reduce_all"); + if (reduce_all) { + // Flatten and reduce 1-D tensor + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + output->mutable_data(context.GetPlace()); + auto x = EigenVector::Flatten(*input); + auto out = EigenScalar::From(*output); + auto& place = + *context.template device_context().eigen_device(); + auto reduce_dim = Eigen::array({{0}}); + Functor functor; + functor(place, x, out, reduce_dim); + } else { + int rank = context.Input("X")->dims().size(); + switch (rank) { + case 1: + ReduceCompute<1>(context); + break; + case 2: + ReduceCompute<2>(context); + break; + case 3: + ReduceCompute<3>(context); + break; + case 4: + ReduceCompute<4>(context); + break; + case 5: + ReduceCompute<5>(context); + break; + case 6: + ReduceCompute<6>(context); + break; + } + } + } + + private: + template + void ReduceCompute(const framework::ExecutionContext& context) const { + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + output->mutable_data(context.GetPlace()); + + auto x = EigenTensor::From(*input); + auto x_rank = static_cast(x.dimensions().size()); + int dim = static_cast(context.Attr("dim")); + if (dim < 0) dim = x_rank + dim; + auto reduce_dim = Eigen::array({{dim}}); + // construct the squeezed output tensor + bool keep_dim = context.Attr("keep_dim"); + DDim dims = output->dims(); + auto dims_vector = vectorize(dims); + if (keep_dim && x_rank > 1) { + dims_vector.erase(dims_vector.begin() + dim); + dims = framework::make_ddim(dims_vector); + } + + auto& place = + *context.template device_context().eigen_device(); + Functor functor; + + if (D == 1) { + auto out = EigenScalar::From(*output); + functor(place, x, out, reduce_dim); + } else { + auto out = EigenTensor::From(*output, dims); + functor(place, x, out, reduce_dim); + } + } +}; + +template +class ReduceGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool reduce_all = context.Attr("reduce_all"); + if (reduce_all) { + auto* input0 = context.Input("X"); + auto* input1 = context.Input("Out"); + auto* input2 = context.Input(framework::GradVarName("Out")); + auto* output = context.Output(framework::GradVarName("X")); + output->mutable_data(context.GetPlace()); + auto x = EigenVector::Flatten(*input0); + auto x_reduce = EigenVector::From(*input1); + auto x_reduce_grad = EigenVector::From(*input2); + auto x_grad = EigenVector::Flatten(*output); + auto& place = + *context.template device_context().eigen_device(); + auto broadcast_dim = + Eigen::array({{static_cast(input0->numel())}}); + Functor functor; + functor(place, x, x_reduce, x_grad, x_reduce_grad, broadcast_dim, + broadcast_dim[0]); + } else { + int rank = context.Input("X")->dims().size(); + switch (rank) { + case 1: + ReduceGradCompute<1>(context); + break; + case 2: + ReduceGradCompute<2>(context); + break; + case 3: + ReduceGradCompute<3>(context); + break; + case 4: + ReduceGradCompute<4>(context); + break; + case 5: + ReduceGradCompute<5>(context); + break; + case 6: + ReduceGradCompute<6>(context); + break; + } + } + } + + private: + template + void ReduceGradCompute(const framework::ExecutionContext& context) const { + auto* input0 = context.Input("X"); + auto* input1 = 
context.Input("Out"); + auto* input2 = context.Input(framework::GradVarName("Out")); + auto* output = context.Output(framework::GradVarName("X")); + + output->mutable_data(context.GetPlace()); + auto x = EigenTensor::From(*input0); + auto x_grad = EigenTensor::From(*output); + auto x_rank = static_cast(x.dimensions().size()); + int dim = static_cast(context.Attr("dim")); + if (dim < 0) dim = x_rank + dim; + DDim dims = input0->dims(); + dims[dim] = 1; + auto x_reduce = EigenTensor::From(*input1, dims); + auto x_reduce_grad = EigenTensor::From(*input2, dims); + + Eigen::array broadcast_dim; + for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1; + broadcast_dim[dim] = input0->dims()[dim]; + auto& place = + *context.template device_context().eigen_device(); + Functor functor; + functor(place, x, x_reduce, x_grad, x_reduce_grad, broadcast_dim, + broadcast_dim[dim]); + } +}; + +} // namespace operators +} // namespace paddle + +#define FOR_EACH_KERNEL_FUNCTOR(__macro) \ + __macro(reduce_sum, SumFunctor, SumGradFunctor); \ + __macro(reduce_mean, MeanFunctor, MeanGradFunctor); \ + __macro(reduce_max, MaxFunctor, MaxOrMinGradFunctor); \ + __macro(reduce_min, MinFunctor, MaxOrMinGradFunctor); diff --git a/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc b/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..148a65bb4b7fe599f2fdb833c179665e58fe1c41 --- /dev/null +++ b/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc @@ -0,0 +1,270 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detail/safe_ref.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +class ReorderLoDTensorByRankTableOpProtoMaker + : public framework::OpProtoAndCheckerMaker { + public: + ReorderLoDTensorByRankTableOpProtoMaker(OpProto *proto, + OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(LoDTensor), the input lod tensor to be reordered according to " + "Input(RankTable)."); + AddInput("RankTable", + "(LoDRankTable), the rank table according to which Input(X) is " + "reordered."); + AddOutput("Out", "(LoDTensor), the reordered lod tensor."); + AddComment(R"DOC(ReorderLoDTensorByRankTable operator. + +Input(X) is a batch of sequences. Input(RankTable) stores new orders of the +input sequence batch. The reorder_lod_tensor_by_rank operator reorders the +Input(X) according to the information provided by Input(RankTable). + +For example: + +If the indices stored in the Input(RankTable) are [3, 0, 2, 1], the +Input(X) will be reordered that the fourth sequence in Input(X) will become the +first one, and then followed by the original first, third, and the second one. + +This is: +X = [Seq0, Seq1, Seq2, Seq3]. The indices in RankTable are [3, 0, 2, 1]. 
+Out = [Seq3, Seq0, Seq2, Seq1] with a new LoD information. + +If the LoD information of Input(X) is empty, this means Input(X) is not sequence +data. This is also identical to a batch of sequences where each sequence has a +fixed length 1. In this case, the reorder_lod_tensor_by_rank operator reorders +each slice of Input(X) along the first axis according to Input(RankTable). + +This is: +X = [Slice0, Slice1, Slice2, Slice3] and its LoD information is empty. The +indices in RankTable are [3, 0, 2, 1]. +Out = [Slice3, Slice0, Slice2, Slice1] with no LoD information is appended. + +NOTE: This operator sorts Input(X) according to a given LoDRankTable which does +not need to be calculated according to Input(X). It can be calculated according +to another different sequence, and then this operator sorts Input(X) according +to the given LoDRankTable. + +)DOC"); + } +}; + +class ReorderLoDTensorByRankTableBase : public framework::OperatorBase { + public: + ReorderLoDTensorByRankTableBase(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto &x = + detail::Ref(scope.FindVar(Input("X")), + "Cannot find input lod tensor variable %s", Input("X")) + .Get(); + auto &rank_table = detail::Ref(scope.FindVar(Input("RankTable")), + "Cannot find input rank table variable %s", + Input("RankTable")) + .Get(); + auto &out = + *detail::Ref(scope.FindVar(Output("Out")), + "Cannot find output lod tensor variable %s", Output("Out")) + .GetMutable(); + + out.Resize(x.dims()); + out.mutable_data(x.place(), x.type()); + this->process(place, x, rank_table, &out); + } + + protected: + virtual void process(const platform::Place &place, + const framework::LoDTensor &x, + const framework::LoDRankTable &rank_table, + framework::LoDTensor *out) const = 0; + + struct AbsoluteRankTableItem { + size_t offset; // the absolute/accumulated offset. 
+ size_t length; // the length + framework::LoD lod; + }; + + std::vector GetAbsoluteOffsetAndLengthByLoDRankTable( + const framework::LoDTensor &x) const { + std::vector absolute_table; + + if (x.lod().empty()) { + // For Tensor without lod, such as the output of sequence_pool_op + size_t size = x.dims()[0]; + absolute_table.reserve(size); + for (size_t i = 0; i < size; ++i) { + absolute_table.emplace_back(); + absolute_table.back().length = 1; + absolute_table.back().offset = i; + } + } else { + size_t level = 0; + size_t size = x.lod()[level].size(); + + for (size_t i = 0; i < size - 1; ++i) { + auto lod_offset = + framework::GetSubLoDAndAbsoluteOffset(x.lod(), i, i + 1, level); + + auto &offset = lod_offset.second; + + absolute_table.emplace_back(); + absolute_table.back().length = offset.second - offset.first; + absolute_table.back().offset = offset.first; + absolute_table.back().lod = lod_offset.first; + } + } + + return absolute_table; + } + + size_t CopyTensorAndLod(const platform::Place &place, + const AbsoluteRankTableItem &item, + const framework::LoDTensor &x, + framework::LoDTensor *out, size_t out_offset) const { + auto &out_lod = *out->mutable_lod(); + auto len = item.length; + auto x_offset = item.offset; + + if (out_lod.empty()) { + for (size_t i = 0; i < item.lod.size(); ++i) { + out_lod.push_back(std::vector({0})); + } + } + + for (size_t i = 0; i < out_lod.size(); ++i) { + auto &out_v = out_lod[i]; + auto &new_lod_v = item.lod[i]; + + for (auto &detail : new_lod_v) { + out_v.push_back(out_v.back() + detail); + } + } + + auto x_sliced = x.Slice(x_offset, x_offset + len); + auto out_sliced = out->Slice(out_offset, out_offset + len); + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + framework::Copy(x_sliced, out_sliced.place(), dev_ctx, &out_sliced); + out_offset += len; + return out_offset; + } +}; + +class ReorderLoDTensorByRankTableOp : public ReorderLoDTensorByRankTableBase { + public: + ReorderLoDTensorByRankTableOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : ReorderLoDTensorByRankTableBase(type, inputs, outputs, attrs) {} + + protected: + void process(const platform::Place &place, const framework::LoDTensor &x, + const framework::LoDRankTable &rank_table, + framework::LoDTensor *out) const override { + auto absolute_table = GetAbsoluteOffsetAndLengthByLoDRankTable(x); + size_t out_offset = 0; + out->mutable_lod()->clear(); + for (auto &item : rank_table.items()) { + PADDLE_ENFORCE_LT(item.index, absolute_table.size()); + out_offset = CopyTensorAndLod(place, absolute_table[item.index], x, out, + out_offset); + } + } +}; + +class IdentityInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + context->SetOutputDim("Out", context->GetInputDim("X")); + } +}; + +class ReorderLodTensorByRankGradOpMaker + : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("reorder_lod_tensor_by_rank_grad"); + grad_op->SetInput("X", OutputGrad("Out")); + grad_op->SetOutput("Out", InputGrad("X")); + grad_op->SetInput("RankTable", Input("RankTable")); + return std::unique_ptr(grad_op); + } +}; + +class ReorderLoDTensorByRankGradOp : public 
ReorderLoDTensorByRankTableBase { + public: + ReorderLoDTensorByRankGradOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : ReorderLoDTensorByRankTableBase(type, inputs, outputs, attrs) {} + + protected: + void process(const platform::Place &place, const framework::LoDTensor &x, + const framework::LoDRankTable &rank_table, + framework::LoDTensor *out) const override { + auto absolute_table = GetAbsoluteOffsetAndLengthByLoDRankTable(x); + + // offsets = enumerate([item.index for item in rank_table.items()]) + std::vector> offsets; + offsets.reserve(rank_table.items().size()); + for (size_t i = 0; i < rank_table.items().size(); ++i) { + offsets.push_back({i, rank_table.items()[i].index}); + } + + // offsets.sort(key=lambda x: x[1]) + std::sort( + offsets.begin(), offsets.end(), + [](const std::pair &a, + const std::pair &b) { return a.second < b.second; }); + + // Copy TensorAndLod + size_t out_offset = 0; + for (auto &offset : offsets) { + out_offset = this->CopyTensorAndLod(place, absolute_table[offset.first], + x, out, out_offset); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(reorder_lod_tensor_by_rank, + ops::ReorderLoDTensorByRankTableOp, + ops::ReorderLodTensorByRankGradOpMaker, + ops::ReorderLoDTensorByRankTableOpProtoMaker, + ops::IdentityInferShape); +REGISTER_OPERATOR(reorder_lod_tensor_by_rank_grad, + ops::ReorderLoDTensorByRankGradOp, ops::IdentityInferShape); diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..b4f80cc06abaa536d1b1097850047fd370246dee --- /dev/null +++ b/paddle/fluid/operators/reshape_op.cc @@ -0,0 +1,130 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/reshape_op.h" + +namespace paddle { +namespace operators { + +class ReshapeOp : public framework::OperatorWithKernel { + public: + ReshapeOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + // input check + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ReshapeOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ReshapeOp should not be null."); + + auto shape = ctx->Attrs().Get>("shape"); + PADDLE_ENFORCE(shape.size() > 0, "Attr(shape) shouldn't be empty."); + auto x_dims = ctx->GetInputDim("X"); + + std::vector neg_dims_idx; + // set some dimension to -1 if it is unknown + const int unknown_size = -1; + for (size_t i = 0; i < shape.size(); ++i) { + PADDLE_ENFORCE(shape[i] > 0 || shape[i] == unknown_size, + "Each dimension of Attr(shape) must be positive or %d.", + unknown_size); + if (shape[i] == unknown_size) { + neg_dims_idx.push_back(i); + PADDLE_ENFORCE(neg_dims_idx.size() <= 1, + "Only one dimension of Attr(shape) can be unknown."); + } + } + + int64_t capacity = + std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); + int64_t in_size = framework::product(x_dims); + if (neg_dims_idx.size() == 1) { + // dim infer + shape[neg_dims_idx[0]] = in_size / (-capacity); + // recalculate capacity + capacity = shape[neg_dims_idx[0]] * (-capacity); + } + // capacity check + PADDLE_ENFORCE(capacity == in_size, + "The size of Input(X) mismatches with Attr(shape)."); + // resize output + std::vector shape_int64(shape.size(), 0); + std::transform(shape.begin(), shape.end(), shape_int64.begin(), + [](int a) { return static_cast(a); }); + auto out_dims = framework::make_ddim(shape_int64); + ctx->SetOutputDim("Out", out_dims); + if (shape[0] == x_dims[0]) { + // Only pass LoD when the first dimension is equal between + // output and input. + ctx->ShareLoD("X", /*->*/ "Out"); + } + } +}; + +class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ReshapeOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input tensor of reshape operator."); + AddOutput("Out", "The output tensor of reshape operator."); + AddAttr>("shape", + "(vector) " + "Target shape of reshape operator."); + AddComment(R"DOC( +Reshape Operator. + +Reshape Input(X) into the shape specified by Attr(shape). + +An example: +Given a 2-D tensor X with 2 rows and 2 columns : [[1, 2], [3, 4]] + +and target shape = [1, 4], the reshape operator will transform +the tensor X into a 2-D tensor: [[1, 2, 3, 4]] + +One dimension in the target shape can be set -1, representing that its +size is unknown. In this case, the real dimension will be infered from +the original shape of Input(X) and other dimensions in the target shape. 
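+
+For example, given the same 2-D tensor X with shape [2, 2] and target
+shape = [-1, 4], the unknown dimension is inferred as 4 / 4 = 1, so the
+output tensor has shape [1, 4].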
+)DOC"); + } +}; + +class ReshapeGradOp : public framework::OperatorWithKernel { + public: + ReshapeGradOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) shouldn't be null."); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } +}; + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; + +REGISTER_OP(reshape, ops::ReshapeOp, ops::ReshapeOpMaker, reshape_grad, + ops::ReshapeGradOp); +REGISTER_OP_CPU_KERNEL(reshape, + ops::ReshapeKernel); +REGISTER_OP_CPU_KERNEL( + reshape_grad, ops::ReshapeGradKernel); diff --git a/paddle/fluid/operators/reshape_op.cu b/paddle/fluid/operators/reshape_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..f9ae6da29e54187b2d6aedb833a2aa4ca95cacba --- /dev/null +++ b/paddle/fluid/operators/reshape_op.cu @@ -0,0 +1,22 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/reshape_op.h" + +REGISTER_OP_CUDA_KERNEL( + reshape, + paddle::operators::ReshapeKernel); +REGISTER_OP_CUDA_KERNEL( + reshape_grad, + paddle::operators::ReshapeGradKernel); diff --git a/paddle/fluid/operators/reshape_op.h b/paddle/fluid/operators/reshape_op.h new file mode 100644 index 0000000000000000000000000000000000000000..a17ba7c619490977b837c565ef1f4cc0780d5c61 --- /dev/null +++ b/paddle/fluid/operators/reshape_op.h @@ -0,0 +1,50 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class ReshapeKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* out = ctx.Output("Out"); + auto* in = ctx.Input("X"); + auto out_dims = out->dims(); + out->mutable_data(ctx.GetPlace()); + framework::Copy(*in, ctx.GetPlace(), ctx.device_context(), out); + out->Resize(out_dims); + } +}; + +template +class ReshapeGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* d_out = ctx.Input(framework::GradVarName("Out")); + auto* d_x = ctx.Output(framework::GradVarName("X")); + d_x->mutable_data(ctx.GetPlace()); + + auto in_dims = d_x->dims(); + framework::Copy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x); + d_x->Resize(in_dims); + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/rmsprop_op.cc b/paddle/fluid/operators/rmsprop_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..06d3ccafefd4cc163b806aeb5d2a582c686f10cb --- /dev/null +++ b/paddle/fluid/operators/rmsprop_op.cc @@ -0,0 +1,119 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/rmsprop_op.h" + +namespace paddle { +namespace operators { + +class RmspropOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Param"), + "Input(Param) of RmspropOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("MeanSquare"), + "Input(MeanSquare) of RmspropOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("LearningRate"), + "Input(LearningRate) of RmspropOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(Grad) of RmspropOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Moment"), + "Input(Moment) of RmspropOp should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + "Output(param_out) of RmspropOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("MomentOut"), + "Output(Momentum_out) of RmspropOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("MeanSquareOut"), + "Output(MeanSquareOut) of RmspropOp should not be null."); + + auto param_dim = ctx->GetInputDim("Param"); + PADDLE_ENFORCE_EQ( + param_dim, ctx->GetInputDim("Grad"), + "Param and grad input of RmspropOp should have the same dimension."); + PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("Moment"), + "Param and Momentum input of RmspropOp " + "should have the same dimension."); + PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("MeanSquare"), + "Param and Momentum input of RmspropOp " + "should have the same dimension."); + + auto lr_dim = ctx->GetInputDim("LearningRate"); + PADDLE_ENFORCE_EQ(framework::product(lr_dim), 1, + "Learning Rate should be a scalar."); + + ctx->SetOutputDim("ParamOut", param_dim); + ctx->SetOutputDim("MomentOut", param_dim); + ctx->SetOutputDim("MeanSquareOut", param_dim); + } +}; + +class RmspropOpMaker : public framework::OpProtoAndCheckerMaker { + public: + RmspropOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Param", + "(Tensor, default Tensor) " + "Input parameter value that has to be updated."); + AddInput("MeanSquare", + "(Tensor, default Tensor)" + " The mean square value that gets updated."); + AddInput("LearningRate", + "(Tensor, default Tensor) " + "The learning rate should be a tensor of size 1."); + AddInput("Grad", + "(Tensor, default Tensor) " + "Input gradient of the parameter."); + AddInput("Moment", + "(Tensor, default Tensor) The moment that gets updated."); + + AddOutput("ParamOut", "(Tensor) Output updated parameter value."); + AddOutput("MomentOut", "(Tensor) Output updated moment."); + AddOutput("MeanSquareOut", "(Tensor) Output Mean squared updated value."); + + AddAttr("epsilon", + "(float, default 1e-10) Constant " + "for numerical stability.") + .SetDefault(1.0e-10f); + AddAttr("decay", + "(float, default 0.9) " + "Discounting factor for coming gradient.") + .SetDefault(0.9f); + AddAttr("momentum", "(float, default 0.0) Constant value.") + .SetDefault(0.0f); + AddComment(R"DOC( +Rmsprop Optimizer. 
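+
+The update keeps an exponentially decayed moving average of the squared
+gradient (MeanSquare) and divides the learning-rate-scaled gradient by the
+square root of this average (with epsilon added for numerical stability),
+optionally accumulating the step into a momentum term: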
+ +$$ +MeanSquareOut = decay * MeanSquare + (1 - decay) * Grad * Grad \\ +MomentOut = momentum * Moment + + \frac{LearningRate * Grad}{\sqrt{MeanSquareOut + epsilon}} \\ +ParamOut = Param - MomentOut +$$ + +The original slides that proposed Rmsprop: Slide 29 of +http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf) + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(rmsprop, ops::RmspropOp, ops::RmspropOpMaker); +REGISTER_OP_CPU_KERNEL( + rmsprop, ops::RmspropOpKernel); diff --git a/paddle/fluid/operators/rmsprop_op.cu b/paddle/fluid/operators/rmsprop_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..a909c942791d2e2e4d9887d4c9265383a93ca137 --- /dev/null +++ b/paddle/fluid/operators/rmsprop_op.cu @@ -0,0 +1,20 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/rmsprop_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + rmsprop, ops::RmspropOpKernel); diff --git a/paddle/fluid/operators/rmsprop_op.h b/paddle/fluid/operators/rmsprop_op.h new file mode 100644 index 0000000000000000000000000000000000000000..469c102a4721ca45026112e3166dc0807ba93292 --- /dev/null +++ b/paddle/fluid/operators/rmsprop_op.h @@ -0,0 +1,67 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; + +template +class RmspropOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* param_out = ctx.Output("ParamOut"); + auto* moment_out = ctx.Output("MomentOut"); + auto* mean_square_out = ctx.Output("MeanSquareOut"); + + auto grad = ctx.Input("Grad"); + + param_out->mutable_data(ctx.GetPlace()); + moment_out->mutable_data(ctx.GetPlace()); + mean_square_out->mutable_data(ctx.GetPlace()); + + float epsilon = ctx.Attr("epsilon"); + float rho = ctx.Attr("decay"); + float momentum = ctx.Attr("momentum"); + + auto p = EigenVector::Flatten(*ctx.Input("Param")); + auto ms = EigenVector::Flatten(*ctx.Input("MeanSquare")); + auto lr = EigenVector::Flatten(*ctx.Input("LearningRate")); + auto g = EigenVector::Flatten(*grad); + auto mom = EigenVector::Flatten(*ctx.Input("Moment")); + + auto p_out = EigenVector::Flatten(*param_out); + auto mom_out = EigenVector::Flatten(*moment_out); + auto ms_out = EigenVector::Flatten(*mean_square_out); + auto& place = *ctx.template device_context().eigen_device(); + + Eigen::DSizes grad_dsize(grad->numel()); + + ms_out.device(place) = rho * ms + (1 - rho) * g * g; + mom_out.device(place) = + momentum * mom + + lr.broadcast(grad_dsize) * g / (ms_out + epsilon).sqrt(); + p_out.device(place) = p - mom_out; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/rnn_memory_helper_op.cc b/paddle/fluid/operators/rnn_memory_helper_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..504456c4b069f81319893ae51f57503f5025761a --- /dev/null +++ b/paddle/fluid/operators/rnn_memory_helper_op.cc @@ -0,0 +1,151 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { +class RNNMemoryHelperOp : public framework::OperatorBase { + public: + RNNMemoryHelperOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &dev_place) const override { + auto mem_var_name = Input("X"); + auto *mem_var = scope.FindVar(mem_var_name); + PADDLE_ENFORCE(mem_var != nullptr, + "Cannot find mem_var in scope, mem_var_name is %s", + mem_var_name); + + auto out_name = this->Output("Out"); + auto *out_var = scope.FindVar(out_name); + PADDLE_ENFORCE(out_var != nullptr, + "Cannot find out_var in scope, out_var_name is %s", + out_name); + + auto *out_tensor = out_var->GetMutable(); + auto &mem_tensor = mem_var->Get(); + out_tensor->ShareDataWith(mem_tensor); + out_tensor->set_lod(mem_tensor.lod()); + } +}; + +class RNNMemoryHelperOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), ""); + PADDLE_ENFORCE(ctx->HasOutput("Out"), ""); + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class RNNMemoryHelperOpInfoMaker : public framework::OpProtoAndCheckerMaker { + public: + RNNMemoryHelperOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", ""); + AddOutput("Out", ""); + AddAttr("dtype", + "(int, default 5 (FP32)) " + "Output data type") + .SetDefault(framework::proto::DataType::FP32); + AddComment(""); + } +}; + +class RNNMemoryHelperGradOp : public framework::OperatorBase { + public: + RNNMemoryHelperGradOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &dev_place) const override { + auto out_grad_var_name = Input(framework::GradVarName("Out")); + auto *out_grad_var = scope.FindVar(out_grad_var_name); + + auto in_grad_var_name = Output(framework::GradVarName("X")); + auto *in_grad_var = scope.FindVar(in_grad_var_name); + PADDLE_ENFORCE(in_grad_var != nullptr, + "Cannot find in_grad_var in scope, name is %s", + in_grad_var_name); + + if (out_grad_var == nullptr) { + VLOG(5) << "Using fill constant 0 as starting gradient"; + auto in_var_name = Input("X"); + auto *in_var = scope.FindVar(in_var_name); + auto &in_var_tensor = in_var->Get(); + + framework::AttributeMap attrs; + attrs["dtype"] = framework::ToDataType(in_var_tensor.type()); + attrs["shape"] = framework::vectorize2int(in_var_tensor.dims()); + attrs["value"] = 0.0f; + + auto zero_op = framework::OpRegistry::CreateOp( + "fill_constant", {}, {{"Out", {in_grad_var_name}}}, attrs); + zero_op->Run(scope, dev_place); + } else { + auto &out_grad_tensor = out_grad_var->Get(); + auto *in_grad_tensor = in_grad_var->GetMutable(); + in_grad_tensor->ShareDataWith(out_grad_tensor); + in_grad_tensor->set_lod(out_grad_tensor.lod()); + } + } +}; + +class RNNMemoryHelperGradOpInfoMaker + : public framework::OpProtoAndCheckerMaker { + public: + RNNMemoryHelperGradOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker) + : 
OpProtoAndCheckerMaker(proto, op_checker) { + AddInput(framework::GradVarName("Out"), ""); + AddInput("X", ""); + AddInput("Out", ""); + AddOutput(framework::GradVarName("X"), ""); + AddAttr("dtype", + "(int, default 5 (FP32)) " + "Output data type") + .SetDefault(framework::proto::DataType::FP32); + AddComment(""); + } +}; + +class RNNMemoryHelperGradOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + auto x_grad_name = framework::GradVarName("X"); + PADDLE_ENFORCE(ctx->HasOutput(x_grad_name), ""); + PADDLE_ENFORCE(ctx->HasInput("X"), ""); + ctx->SetOutputDim(x_grad_name, ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ x_grad_name); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(rnn_memory_helper, paddle::operators::RNNMemoryHelperOp, + paddle::operators::RNNMemoryHelperOpInfoMaker, + paddle::operators::RNNMemoryHelperOpShapeInference, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(rnn_memory_helper_grad, + paddle::operators::RNNMemoryHelperGradOp, + paddle::operators::RNNMemoryHelperGradOpInfoMaker, + paddle::operators::RNNMemoryHelperGradOpShapeInference); diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..09238f89a775979b8b1866d410e6ad1ef772f9d7 --- /dev/null +++ b/paddle/fluid/operators/roi_pool_op.cc @@ -0,0 +1,165 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/roi_pool_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +static constexpr int kROISize = 5; + +class ROIPoolOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ROIPoolOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("ROIs"), + "Input(ROIs) of ROIPoolOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ROIPoolOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Argmax"), + "Output(Argmax) of ROIPoolOp should not be null."); + auto input_dims = ctx->GetInputDim("X"); + auto rois_dims = ctx->GetInputDim("ROIs"); + + PADDLE_ENFORCE(input_dims.size() == 4, + "The format of input tensor is NCHW."); + PADDLE_ENFORCE(rois_dims.size() == 2, + "ROIs should be a 2-D tensor of shape (num_rois, 5)" + "given as [[batch_id, x1, y1, x2, y2], …]."); + PADDLE_ENFORCE(rois_dims[1] == kROISize, + "ROIs should be a 2-D tensor of shape (num_rois, 5)" + "given as [[batch_id, x1, y1, x2, y2], …]."); + + int pooled_height = ctx->Attrs().Get("pooled_height"); + int pooled_width = ctx->Attrs().Get("pooled_width"); + float spatial_scale = ctx->Attrs().Get("spatial_scale"); + + PADDLE_ENFORCE_GT(pooled_height, 0, + "The pooled output height must greater than 0"); + PADDLE_ENFORCE_GT(pooled_width, 0, + "The pooled output width must greater than 0"); + PADDLE_ENFORCE_GT(spatial_scale, 0.0f, + "The spatial scale must greater than 0"); + + auto out_dims = input_dims; + out_dims[0] = rois_dims[0]; + out_dims[1] = input_dims[1]; + out_dims[2] = pooled_height; + out_dims[3] = pooled_width; + + ctx->SetOutputDim("Out", out_dims); + ctx->SetOutputDim("Argmax", out_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +class ROIPoolGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "The gradient of Out should not be null."); + PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName("X")), + "The gradient of X should not be null."); + ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ROIPoolOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(Tensor), " + "the input of ROIPoolOp. " + "The format of input tensor is NCHW. Where N is batch size, " + "C is the number of input channels, " + "H is the height of the feature, and " + "W is the width of the feature."); + AddInput("ROIs", + "(Tensor), " + "ROIs (Regions of Interest) to pool over. " + "should be a 2-D tensor of shape (num_rois, 5)" + "given as [[batch_id, x1, y1, x2, y2], …]. 
" + "Where batch_id is the id of the data, " + "(x1, y1) is the top left coordinates, and " + "(x2, y2) is the bottom right coordinates."); + AddOutput("Out", + "(Tensor), " + "The output of ROIPoolOp is a 4-D tensor with shape " + "(num_rois, channels, pooled_h, pooled_w)."); + AddOutput("Argmax", + "(Tensor), " + "Argmaxes corresponding to indices in X used " + "for gradient computation. Only output " + "if arg “is_test” is false.") + .AsIntermediate(); + AddAttr("spatial_scale", + "(float, default 1.0), " + "Multiplicative spatial scale factor " + "to translate ROI coords from their input scale " + "to the scale used when pooling.") + .SetDefault(1.0); + AddAttr("pooled_height", + "(int, default 1), " + "The pooled output height.") + .SetDefault(1); + AddAttr("pooled_width", + "(int, default 1), " + "The pooled output width.") + .SetDefault(1); + AddComment(R"DOC( +ROIPool operator + +ROI Pooling for Faster-RCNN. The link below is a further introduction: +https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker, roi_pool_grad, + ops::ROIPoolGradOp); +REGISTER_OP_CPU_KERNEL( + roi_pool, + ops::CPUROIPoolOpKernel, + ops::CPUROIPoolOpKernel); +REGISTER_OP_CPU_KERNEL( + roi_pool_grad, + ops::CPUROIPoolGradOpKernel, + ops::CPUROIPoolOpKernel); diff --git a/paddle/fluid/operators/roi_pool_op.cu b/paddle/fluid/operators/roi_pool_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..0e8fc9ec7a68cffeb45f8ece3e5bde39d1e71e92 --- /dev/null +++ b/paddle/fluid/operators/roi_pool_op.cu @@ -0,0 +1,209 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/roi_pool_op.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaxinumNumBlocks = 4096; +static constexpr int kROISize = 5; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaxinumNumBlocks); +} + +template +__global__ void GPUROIPoolForward(const int nthreads, const T* input_data, + const int64_t* input_rois, + const float spatial_scale, const int channels, + const int height, const int width, + const int pooled_height, + const int pooled_width, T* output_data, + int64_t* argmax_data) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (size_t i = index; i < nthreads; i += offset) { + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const int64_t* offset_input_rois = input_rois + n * kROISize; + int roi_batch_ind = offset_input_rois[0]; + int roi_start_w = round(offset_input_rois[1] * spatial_scale); + int roi_start_h = round(offset_input_rois[2] * spatial_scale); + int roi_end_w = round(offset_input_rois[3] * spatial_scale); + int roi_end_h = round(offset_input_rois[4] * spatial_scale); + + int roi_width = max(roi_end_w - roi_start_w + 1, 1); + int roi_height = max(roi_end_h - roi_start_h + 1, 1); + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + int hstart = static_cast(floor(static_cast(ph) * bin_size_h)); + int wstart = static_cast(floor(static_cast(pw) * bin_size_w)); + int hend = static_cast(ceil(static_cast(ph + 1) * bin_size_h)); + int wend = static_cast(ceil(static_cast(pw + 1) * bin_size_w)); + + hstart = min(max(hstart + roi_start_h, 0), height); + hend = min(max(hend + roi_start_h, 0), height); + wstart = min(max(wstart + roi_start_w, 0), width); + wend = min(max(wend + roi_start_w, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + T maxval = is_empty ? 
0 : -std::numeric_limits::max(); + int maxidx = -1; + const T* offset_input_data = + input_data + (roi_batch_ind * channels + c) * height * width; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int input_data_index = h * width + w; + if (offset_input_data[input_data_index] > maxval) { + maxval = offset_input_data[input_data_index]; + maxidx = input_data_index; + } + } + } + output_data[index] = maxval; + if (argmax_data) { + argmax_data[index] = maxidx; + } + } +} + +template +__global__ void GPUROIPoolBackward( + const int nthreads, const int64_t* input_rois, const T* output_grad, + const int64_t* argmax_data, const int num_rois, const float spatial_scale, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, T* input_grad) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (int i = index; i < nthreads; i += offset) { + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const int64_t* offset_input_rois = input_rois + n * kROISize; + int roi_batch_ind = offset_input_rois[0]; + int input_offset = (roi_batch_ind * channels + c) * height * width; + int output_offset = (n * channels + c) * pooled_height * pooled_width; + const T* offset_output_grad = output_grad + output_offset; + T* offset_input_grad = input_grad + input_offset; + const int64_t* offset_argmax_data = argmax_data + output_offset; + + int argmax = offset_argmax_data[ph * pooled_width + pw]; + if (argmax != -1) { + platform::CudaAtomicAdd( + offset_input_grad + argmax, + static_cast(offset_output_grad[ph * pooled_width + pw])); + } + } +} + +template +class GPUROIPoolOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("ROIs"); + auto* out = ctx.Output("Out"); + auto* argmax = ctx.Output("Argmax"); + + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto spatial_scale = ctx.Attr("spatial_scale"); + + auto in_dims = in->dims(); + auto in_stride = framework::stride(in_dims); + int channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + + size_t rois_num = rois->dims()[0]; + if (rois_num == 0) return; + + int output_size = out->numel(); + int blocks = NumBlocks(output_size); + int threads = kNumCUDAThreads; + + GPUROIPoolForward< + T><<>>( + output_size, in->data(), rois->data(), spatial_scale, + channels, height, width, pooled_height, pooled_width, + out->mutable_data(ctx.GetPlace()), + argmax->mutable_data(ctx.GetPlace())); + } +}; + +template +class GPUROIPoolGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("ROIs"); + auto* argmax = ctx.Input("Argmax"); + + auto* out_grad = ctx.Input(framework::GradVarName("Out")); + auto* x_grad = ctx.Output(framework::GradVarName("X")); + + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto spatial_scale = ctx.Attr("spatial_scale"); + + size_t rois_num = rois->dims()[0]; + int channels = in->dims()[1]; + int height = in->dims()[2]; + int width = in->dims()[3]; + + if (x_grad) { + x_grad->mutable_data(ctx.GetPlace()); + math::SetConstant 
set_zero; + set_zero(ctx.cuda_device_context(), x_grad, static_cast(0)); + + int output_grad_size = out_grad->numel(); + int blocks = NumBlocks(output_grad_size); + int threads = kNumCUDAThreads; + + if (output_grad_size > 0) { + GPUROIPoolBackward< + T><<>>( + output_grad_size, rois->data(), out_grad->data(), + argmax->data(), rois_num, spatial_scale, channels, height, + width, pooled_height, pooled_width, + x_grad->mutable_data(ctx.GetPlace())); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + roi_pool, + ops::GPUROIPoolOpKernel, + ops::GPUROIPoolOpKernel); +REGISTER_OP_CUDA_KERNEL( + roi_pool_grad, + ops::GPUROIPoolGradOpKernel, + ops::GPUROIPoolOpKernel); diff --git a/paddle/fluid/operators/roi_pool_op.h b/paddle/fluid/operators/roi_pool_op.h new file mode 100644 index 0000000000000000000000000000000000000000..15f3b36fcd16bf72b9b09f58a3019b24538eec12 --- /dev/null +++ b/paddle/fluid/operators/roi_pool_op.h @@ -0,0 +1,184 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +class CPUROIPoolOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("ROIs"); + auto* out = ctx.Output("Out"); + auto* argmax = ctx.Output("Argmax"); + + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto spatial_scale = ctx.Attr("spatial_scale"); + + auto in_dims = in->dims(); + int batch_size = in_dims[0]; + int channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + int rois_num = rois->dims()[0]; + + auto in_stride = framework::stride(in_dims); + auto argmax_stride = framework::stride(argmax->dims()); + auto roi_stride = framework::stride(rois->dims()); + auto out_stride = framework::stride(out->dims()); + + const T* input_data = in->data(); + const int64_t* rois_data = rois->data(); + T* output_data = out->mutable_data(ctx.GetPlace()); + int64_t* argmax_data = argmax->mutable_data(ctx.GetPlace()); + + for (int n = 0; n < rois_num; ++n) { + int roi_batch_id = rois_data[0]; + PADDLE_ENFORCE_GE(roi_batch_id, 0); + PADDLE_ENFORCE_LT(roi_batch_id, batch_size); + rois_data += roi_stride[0]; + } + + rois_data = rois->data(); + for (int n = 0; n < rois_num; ++n) { + int roi_batch_id = rois_data[0]; + int roi_start_w = round(rois_data[1] * spatial_scale); + int roi_start_h = round(rois_data[2] * spatial_scale); + int roi_end_w = round(rois_data[3] * spatial_scale); + int roi_end_h = round(rois_data[4] * spatial_scale); + + // Force malformed ROIs to be 1x1 + int roi_height = std::max(roi_end_h - roi_start_h + 1, 1); + int roi_width = std::max(roi_end_w - roi_start_w + 1, 1); + + const float bin_size_h = + static_cast(roi_height) 
/ static_cast(pooled_height); + const float bin_size_w = + static_cast(roi_width) / static_cast(pooled_width); + + const T* batch_data = input_data + roi_batch_id * in_stride[0]; + + for (int c = 0; c < channels; ++c) { + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + // Compute pooling region for this output unit: + // start (included) = floor(ph * roi_height / pooled_height_) + // end (excluded) = ceil((ph + 1) * roi_height / pooled_height_) + int hstart = + static_cast(floor(static_cast(ph) * bin_size_h)); + int wstart = + static_cast(floor(static_cast(pw) * bin_size_w)); + int hend = + static_cast(ceil(static_cast(ph + 1) * bin_size_h)); + int wend = + static_cast(ceil(static_cast(pw + 1) * bin_size_w)); + + hstart = std::min(std::max(hstart + roi_start_h, 0), height); + hend = std::min(std::max(hend + roi_start_h, 0), height); + wstart = std::min(std::max(wstart + roi_start_w, 0), width); + wend = std::min(std::max(wend + roi_start_w, 0), width); + + const int pool_index = ph * pooled_width + pw; + + // Define an empty pooling region to be zero + bool is_empty = (hend <= hstart) || (wend <= wstart); + output_data[pool_index] = + is_empty ? 0 : -std::numeric_limits::max(); + argmax_data[pool_index] = -1; + + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int index = h * width + w; + if (batch_data[index] > output_data[pool_index]) { + output_data[pool_index] = batch_data[index]; + argmax_data[pool_index] = index; + } + } + } + } + } + + batch_data += in_stride[1]; + output_data += out_stride[1]; + argmax_data += argmax_stride[1]; + } + // Increment ROI data pointer + rois_data += roi_stride[0]; + } + return; + } +}; + +template +class CPUROIPoolGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("ROIs"); + auto* argmax = ctx.Input("Argmax"); + auto* out_grad = + ctx.Input(framework::GradVarName("Out")); + auto* in_grad = ctx.Output(framework::GradVarName("X")); + + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + + if (in_grad) { + const int64_t* rois_data = rois->data(); + const T* out_grad_data = out_grad->data(); + const int64_t* argmax_data = argmax->data(); + T* in_grad_data = in_grad->mutable_data(ctx.GetPlace()); + math::SetConstant set_zero; + set_zero(ctx.template device_context(), in_grad, + static_cast(0)); + + auto in_stride = framework::stride(in->dims()); + auto argmax_stride = framework::stride(argmax->dims()); + auto roi_stride = framework::stride(rois->dims()); + auto out_stride = framework::stride(out_grad->dims()); + + int rois_num = rois->dims()[0]; + int channels = in->dims()[1]; + + for (int n = 0; n < rois_num; ++n) { + int roi_batch_idx = rois_data[0]; + T* batch_grad_data = in_grad_data + roi_batch_idx * in_stride[0]; + for (int c = 0; c < channels; ++c) { + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + int pool_index = ph * pooled_width + pw; + if (argmax_data[pool_index] >= 0) { + auto index = argmax_data[pool_index]; + batch_grad_data[index] += out_grad_data[pool_index]; + } + } + } + batch_grad_data += in_stride[1]; + out_grad_data += out_stride[1]; + argmax_data += argmax_stride[1]; + } + rois_data += roi_stride[0]; + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/row_conv_op.cc 
b/paddle/fluid/operators/row_conv_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..92661ea9716a89a66c27fa21543d81b5a280bcdd --- /dev/null +++ b/paddle/fluid/operators/row_conv_op.cc @@ -0,0 +1,259 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/row_conv_op.h" +#include "paddle/fluid/framework/eigen.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using framework::Tensor; + +template +using EigenMatrix = framework::EigenMatrix; + +class RowConvOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of RowConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Filter"), + "Input(Filter) of RowConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of RowConvOp should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + auto filter_dims = ctx->GetInputDim("Filter"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2."); + PADDLE_ENFORCE_EQ(filter_dims.size(), 2, "Input(Y)'s rank should be 2."); + PADDLE_ENFORCE_EQ( + x_dims[1], filter_dims[1], + "The 2nd dimension of Input(X) and Input(Filter) should be same."); + ctx->SetOutputDim("Out", x_dims); + ctx->ShareLoD("X", "Out"); + } +}; + +class RowConvGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Filter"), + "Input(Filter) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Gradient of output(Out) should not be null."); + + auto x_grad_name = framework::GradVarName("X"); + if (ctx->HasOutput(x_grad_name)) { + auto x_dims = ctx->GetInputDim("X"); + ctx->SetOutputDim(x_grad_name, x_dims); + } + + auto filter_grad_name = framework::GradVarName("Filter"); + if (ctx->HasOutput(filter_grad_name)) { + auto filter_dims = ctx->GetInputDim("Filter"); + ctx->SetOutputDim(filter_grad_name, filter_dims); + } + } +}; + +class RowConvOpMaker : public framework::OpProtoAndCheckerMaker { + public: + RowConvOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(LoDTensor), the input(X) is a LodTensor, which supports " + "variable time-length input sequences. The underlying tensor " + "in this LoDTensor is a matrix with shape (T x N), where T " + "is the total time steps in this mini-batch and N is the input " + "data dimension."); + AddInput("Filter", + "(Tensor), the input(Filter) is a learnable parameter. 
It " + "is a 2-D tensor with shape (future_context x N), where, " + "future_context is the future context length and N is the data " + "dimension."); + AddOutput("Out", + "(LoDTensor), the output(Out) is a LodTensor, which supports " + "variable time-length input sequences. The underlying tensor " + "in this LodTensor is a matrix with shape T x N, i.e., the " + "same shape as X."); + AddComment(R"DOC( +Row-convolution Operator. + +The row convolution is called lookahead convolution. This operator was +introduced in the following paper for DeepSpeech2: +http://www.cs.cmu.edu/~dyogatam/papers/wang+etal.iclrworkshop2016.pdf + +The main motivation is that a bidirectional RNN, useful in DeepSpeech +like speech models, learns representation for a sequence by performing a +forward and a backward pass through the entire sequence. However, unlike +unidirectional RNNs, bidirectional RNNs are challenging to deploy in an online +and low-latency setting. The lookahead convolution incorporates information +from future subsequences in a computationally efficient manner to improve +unidirectional recurrent neural networks. The row convolution operator is +different from the 1D sequence convolution, and is computed as follows: + +Given an input sequence $in$ of length $t$ and input dimension $d$, +and a filter ($W$) of size $context \times d$, +the output sequence is convolved as: + +$$ +out_{i, :} = \sum_{j=i}^{i + context} in_{j,:} \dot W_{i-j, :} +$$ + +)DOC"); + } +}; + +template +class RowConvKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *x = context.Input("X"); + auto *filter = context.Input("Filter"); + auto *out = context.Output("Out"); + + out->mutable_data(context.GetPlace()); + + auto batch_indices = x->lod()[0]; + auto input_dim = x->dims()[1]; // 'in' is of size T x N + size_t num_sequence = batch_indices.size() - 1; + + auto future_context = filter->dims()[0]; + auto weights = EigenMatrix::From(*filter); + + for (size_t i = 0; i < num_sequence; i++) { + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + Tensor cur_input_sequence = + x->Slice(start, end); // Current input sequence + Tensor cur_output_sequence = + out->Slice(start, end); // Current output sequence + auto cip_seq = EigenMatrix::From(cur_input_sequence); + auto cot_seq = EigenMatrix::From(cur_output_sequence); + + for (int k = 0; k < current_timesteps; + k++) { // For different time steps in the same sequence + for (int w = 0; (w < future_context) && ((k + w) < current_timesteps); + w++) { + for (int d = 0; d < input_dim; d++) { + if (w == 0) { + cot_seq(k, d) = weights(w, d) * cip_seq(k + w, d); + } else { + cot_seq(k, d) += weights(w, d) * cip_seq(k + w, d); + } + } + } + } + } + } +}; + +template +class RowConvGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *x = context.Input("X"); + auto *filter = context.Input("Filter"); + auto *d_out = context.Input(framework::GradVarName("Out")); + auto *dx = context.Output(framework::GradVarName("X")); + auto *d_filter = context.Output(framework::GradVarName("Filter")); + + auto input_dim = x->dims()[1]; // 'x' is of size T x N + auto batch_indices = x->lod()[0]; + size_t num_sequence = batch_indices.size() - 1; + auto future_context = filter->dims()[0]; + + if (d_filter) { + d_filter->mutable_data(context.GetPlace()); + auto dweights = 
+ EigenMatrix::From(*d_filter); // Gradient of weight matrix + dweights.setZero(); + + for (size_t i = 0; i < num_sequence; i++) { // For different sequences + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + + Tensor cur_input = x->Slice(start, end); // Current input sequence + Tensor cur_doutput = + d_out->Slice(start, end); // Current output grad sequence + + auto cur_ip = EigenMatrix::From(cur_input); + auto cur_dout = EigenMatrix::From(cur_doutput); + int current_timesteps = end - start; + + for (int k = 0; k < current_timesteps; + k++) { // For different time steps in the same sequence + for (int w = 0; (w < future_context) && ((k + w) < current_timesteps); + w++) { + // For dweights (Updating the gradient of weight matrix) + for (int d = 0; d < input_dim; d++) { + dweights(w, d) += cur_ip(k + w, d) * cur_dout(k, d); + } + } + } + } + } + + if (dx) { + dx->mutable_data(context.GetPlace()); + auto weights = EigenMatrix::From(*filter); + for (size_t i = 0; i < num_sequence; i++) { // For different sequences + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + + Tensor cur_doutput = + d_out->Slice(start, end); // Current output grad sequence + Tensor cur_dinput = + dx->Slice(start, end); // Current input grad sequence + + auto cur_dout = EigenMatrix::From(cur_doutput); + auto cur_dip = EigenMatrix::From(cur_dinput); + cur_dip.setZero(); + int current_timesteps = end - start; + + for (int k = 0; k < current_timesteps; + k++) { // For different time steps in the same sequence + for (int w = 0; (w < future_context) && ((k + w) < current_timesteps); + w++) { + // For dinput (Updating the gradient wrt input) + for (int d = 0; d < input_dim; d++) { + cur_dip(k + w, d) += weights(w, d) * cur_dout(k, d); + } + } + } + } + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(row_conv, ops::RowConvOp, ops::RowConvOpMaker, row_conv_grad, + ops::RowConvGradOp); +REGISTER_OP_CPU_KERNEL( + row_conv, ops::RowConvKernel); +REGISTER_OP_CPU_KERNEL( + row_conv_grad, + ops::RowConvGradKernel); diff --git a/paddle/fluid/operators/row_conv_op.cu b/paddle/fluid/operators/row_conv_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..832072edf810099d142c82930abfd7f198a7d1b8 --- /dev/null +++ b/paddle/fluid/operators/row_conv_op.cu @@ -0,0 +1,410 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/row_conv_op.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using framework::Tensor; + +namespace { + +inline int DivUp(int x, int y) { return (x + y - 1) / y; } + +// Forward prop (shared memory version, for small future_context) +template +__global__ void RowConvForwardSharedMemory(const T *in, const T *wt, + int num_sequence, int input_dim, + int future_context, + const size_t *batch_indices, + T *out) { + int blx = blockDim.x; + int bly = blockDim.y; + int thx = threadIdx.x; + int thy = threadIdx.y; + int d = blockIdx.x * blx + thx; // index along input dim + + extern __shared__ T mem[]; + T *sw = mem; + + if (thy < future_context) { + sw[thy * blx + thx] = + (d < input_dim) ? wt[thy * input_dim + d] : static_cast(0); + } + __syncthreads(); + + for (size_t i = 0; i < num_sequence; i++) { + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + for (int k = thy; k < current_timesteps; k += bly) { + T sum = 0; + for (int w = 0; (w < future_context) && ((k + w) < current_timesteps); + w++) { + sum += (d < input_dim) + ? sw[w * blx + thx] * in[(start + k + w) * input_dim + d] + : static_cast(0); + } + if (d < input_dim) { + out[(start + k) * input_dim + d] = sum; + } + } + } +} + +// Forward prop (naive version) +template +__global__ void RowConvForward(const T *in, const T *wt, int num_sequence, + int input_dim, int future_context, + const size_t *batch_indices, T *out) { + int d = blockIdx.x * blockDim.x + threadIdx.x; // index along input_dim + int bly = blockDim.y; + int thy = threadIdx.y; + + if (d >= input_dim) return; + + for (size_t i = 0; i < num_sequence; i++) { + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + for (int k = thy; k < current_timesteps; k += bly) { + T sum = 0; + for (int w = 0; (w < future_context) && ((k + w) < current_timesteps); + w++) { + sum += (wt[w * input_dim + d] * in[(start + k + w) * input_dim + d]); + } + out[(start + k) * input_dim + d] = sum; + } + } +} + +// Compute input gradient (shared memory version, for small future_context) +template +__global__ void RowConvGradInputSharedMemory(const T *dout, const T *wt, + int num_sequence, int input_dim, + int future_context, + const size_t *batch_indices, + T *din) { + int blx = blockDim.x; + int bly = blockDim.y; + int thx = threadIdx.x; + int thy = threadIdx.y; + int d = blockIdx.x * blx + thx; // index along input dim + + extern __shared__ T mem[]; + T *sw = mem; + if (thy < future_context) { + sw[thy * blx + thx] = + (d < input_dim) ? wt[thy * input_dim + d] : static_cast(0); + } + __syncthreads(); + + for (int i = 0; i < num_sequence; i++) { + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + for (int k = thy; k < current_timesteps; k += bly) { + T sum = 0; + for (int w = 0; (w < future_context) && ((k - w) >= 0); w++) { + sum += (d < input_dim) + ? 
(sw[w * blx + thx] * dout[(k + start - w) * input_dim + d]) + : static_cast(0); + } + if (d < input_dim) { + din[(k + start) * input_dim + d] = sum; + } + } + } +} + +// Compute input gradient (Naive version) +template +__global__ void RowConvGradInput(const T *dout, const T *wt, int num_sequence, + int input_dim, int future_context, + const size_t *batch_indices, T *din) { + int d = blockIdx.x * blockDim.x + threadIdx.x; // index along input_dim + int bly = blockDim.y; + int thy = threadIdx.y; + + if (d >= input_dim) return; + for (int i = 0; i < num_sequence; i++) { + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + for (int k = thy; k < current_timesteps; k += bly) { + T sum = 0; + for (int w = 0; (w < future_context) && ((k - w) >= 0); w++) { + sum += (wt[w * input_dim + d] * dout[(k + start - w) * input_dim + d]); + } + din[(k + start) * input_dim + d] = sum; + } + } +} + +// Compute W gradient (small future_context version) +template +__global__ void RowConvGradFilterImproved(const T *in, const T *dout, + int num_sequence, int input_dim, + int future_context, int block_x, + int block_y, + const size_t *batch_indices, + T *dfilter) { + int blx = blockDim.x; + int bly = blockDim.y; + int thx = threadIdx.x; + int thy = threadIdx.y; + int gx = blockIdx.x * blx; + int d = gx + thx; // index along input dim + + extern __shared__ T mem[]; + + int xdim_sh_in = block_y; + int xdim_sh_dout = block_y; + // int xdim_sh_dfilter = future_context; + int ydim_sh_in = block_x; + int ydim_sh_dout = block_x + future_context - 1; + int ydim_sh_dfilter = block_y; + + T *sh_in = mem; + T *sh_dout = &mem[xdim_sh_in * ydim_sh_in]; + T *sh_dfilter = &mem[xdim_sh_in * ydim_sh_in + xdim_sh_dout * ydim_sh_dout]; + + if (thy < future_context) { + sh_dfilter[thy * ydim_sh_dfilter + thx] = static_cast(0); + } + __syncthreads(); + + for (int i = 0; i < num_sequence; i++) { + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + int scaled_cur_steps = + ((current_timesteps + block_x - 1) / block_x) * block_x; + + for (int k = thy; k < scaled_cur_steps; k += block_x) { + int pos = start + k; + sh_in[thx * ydim_sh_in + thy] = + (d < input_dim && pos < end) ? in[pos * input_dim + d] : T(0); + sh_dout[thx * ydim_sh_dout + thy + future_context - 1] = + (d < input_dim && pos < end) ? dout[pos * input_dim + d] : T(0); + __syncthreads(); + + if (thy < future_context - 1) { + int pos_offset = pos - future_context + 1; + sh_dout[thx * ydim_sh_dout + thy] = + (d < input_dim && pos_offset >= start) + ? dout[pos_offset * input_dim + d] + : T(0); + } + __syncthreads(); + + for (int w = 0; w < future_context; w++) { + T val = sh_in[thy * ydim_sh_in + thx] * + sh_dout[thy * ydim_sh_dout + thx + future_context - 1 - w]; + __syncthreads(); + + for (int offset = 16; offset > 0; + offset = offset / 2) { // blockDim.x is 32. 
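+          // Warp-level tree reduction (the surrounding code assumes
+          // blockDim.x == 32, i.e. one warp along x): __shfl_down(val, offset)
+          // reads `val` from the lane `offset` positions higher in the warp,
+          // so after offsets 16, 8, 4, 2, 1 lane 0 (thx == 0) holds the sum
+          // over all 32 lanes, which is then accumulated into sh_dfilter.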
+ val += __shfl_down(val, offset); + } + __syncthreads(); + + if (thx == 0) { + sh_dfilter[w * ydim_sh_dfilter + thy] += val; + } + __syncthreads(); + } + } + } + for (int w = thy; (w < future_context) && (d < input_dim); w += bly) { + dfilter[w * input_dim + d] += sh_dfilter[w * ydim_sh_dfilter + thx]; + } +} + +// Compute weight(filter) gradient +template +__global__ void RowConvGradFilter(const T *in, const T *dout, int num_sequence, + int input_dim, int future_context, + int block_x, int block_y, + const size_t *batch_indices, T *dfilter) { + int blx = blockDim.x; + int thx = threadIdx.x; + int thy = threadIdx.y; + int gx = blockIdx.x * blx; + int d = gx + thx; // index along input dim + extern __shared__ T mem[]; + T *sh_in = mem; + T *sh_dout = &mem[block_x * block_y]; + + for (int i = 0; i < num_sequence; i++) { + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + int scaled_cur_steps = + ((current_timesteps + block_x - 1) / block_x) * block_x; + + for (int k = thy; k < scaled_cur_steps; k += block_x) { + int pos = start + k; + sh_in[thx * block_y + thy] = + (d < input_dim && pos < end) ? in[pos * input_dim + d] : 0.0; + __syncthreads(); + + for (int w = 0; w < future_context; w++) { + sh_dout[thx * block_y + thy] = + (d < input_dim && (k - w) >= 0 && (k - w) < current_timesteps) + ? dout[(pos - w) * input_dim + d] + : 0.0; + __syncthreads(); + + T val = sh_in[thy * block_y + thx] * sh_dout[thy * block_y + thx]; + __syncthreads(); + + for (int offset = 16; offset > 0; + offset = offset / 2) { // blockDim.x is 32. + val += __shfl_down(val, offset); + } + __syncthreads(); + + if (thx == 0 && (gx + thy) < input_dim) { + dfilter[w * input_dim + gx + thy] += val; + } + } + } + } +} + +} // namespace + +template +class RowConvKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *X = context.Input("X"); + auto *Filter = context.Input("Filter"); + auto *Out = context.Output("Out"); + + const T *in = X->data(); + const T *weight = Filter->data(); + T *out = Out->mutable_data(context.GetPlace()); + + auto batch_indices = X->lod()[0]; + int input_dim = X->dims()[1]; + int num_sequence = batch_indices.size() - 1; + int future_context = Filter->dims()[0]; + size_t *idx = batch_indices.CUDAMutableData(context.GetPlace()); + auto stream = context.cuda_device_context().stream(); + + if (future_context <= 32) { + dim3 block_dim = dim3(32, 32); + dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); + int mem_per_block = (future_context * block_dim.x) * sizeof(T); + RowConvForwardSharedMemory< + T><<>>( + in, weight, num_sequence, input_dim, future_context, idx, out); + } else { + dim3 block_dim = dim3(32, 32); + dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); + RowConvForward<<>>( + in, weight, num_sequence, input_dim, future_context, idx, out); + } + } +}; + +template +class RowConvGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *X = context.Input("X"); + auto *Filter = context.Input("Filter"); + auto *dOut = context.Input(framework::GradVarName("Out")); + const T *in = X->data(); + const T *weights = Filter->data(); + const T *dout = dOut->data(); + + Tensor *dX = context.Output(framework::GradVarName("X")); + Tensor *dFilter = context.Output(framework::GradVarName("Filter")); + + auto batch_indices = X->lod()[0]; + int input_dim = 
X->dims()[1]; + int num_sequence = batch_indices.size() - 1; + int future_context = Filter->dims()[0]; + size_t *idx = batch_indices.CUDAMutableData(context.GetPlace()); + + auto &device_ctx = context.cuda_device_context(); + math::SetConstant zero; + + if (dFilter) { + T *dfilter = dFilter->mutable_data(context.GetPlace()); + zero(device_ctx, dFilter, static_cast(0.0)); + + if (future_context <= 32) { + dim3 block_dim = dim3(32, 32); + dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); + int block_x = block_dim.x; + int block_y = block_dim.y; + int mem_per_block = + (block_y * block_x + block_y * (block_x + future_context - 1) + + future_context * block_y) * + sizeof(T); + RowConvGradFilterImproved< + T><<>>( + in, dout, num_sequence, input_dim, future_context, block_x, block_y, + idx, dfilter); + } else { + dim3 block_dim = dim3(32, 32); + dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); + int block_x = block_dim.x; + int block_y = block_dim.y; + int mem_per_block = + (block_x * block_y * 2) * sizeof(T); // For 2 arrays of size 32x32 + RowConvGradFilter< + T><<>>( + in, dout, num_sequence, input_dim, future_context, block_x, block_y, + idx, dfilter); + } + } + + if (dX) { + T *din = dX->mutable_data(context.GetPlace()); + if (future_context <= 32) { + dim3 block_dim = dim3(32, 32); + dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); + int mem_per_block = (future_context * block_dim.x) * sizeof(T); + RowConvGradInputSharedMemory< + T><<>>( + dout, weights, num_sequence, input_dim, future_context, idx, din); + } else { + dim3 block_dim = dim3(32, 32); + dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); + RowConvGradInput<<>>( + dout, weights, num_sequence, input_dim, future_context, idx, din); + } + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + row_conv, ops::RowConvKernel); +REGISTER_OP_CUDA_KERNEL( + row_conv_grad, + ops::RowConvGradKernel); diff --git a/paddle/fluid/operators/row_conv_op.h b/paddle/fluid/operators/row_conv_op.h new file mode 100644 index 0000000000000000000000000000000000000000..59164b5215910630b4641501bc0b0c0e941911c2 --- /dev/null +++ b/paddle/fluid/operators/row_conv_op.h @@ -0,0 +1,33 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template <typename DeviceContext, typename T> +class RowConvKernel : public framework::OpKernel<T> { + public: + void Compute(const framework::ExecutionContext &context) const override; +}; + +template <typename DeviceContext, typename T> +class RowConvGradKernel : public framework::OpKernel<T> { + public: + void Compute(const framework::ExecutionContext &context) const override; +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/save_combine_op.cc b/paddle/fluid/operators/save_combine_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c23de9073ef965b989e98936b2dd07fc6bce7fdc --- /dev/null +++ b/paddle/fluid/operators/save_combine_op.cc @@ -0,0 +1,141 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +// TODO(sidgoyal78): These functions are needed by other files (save_op), move +them to paddle::filesystem namespace. (as noted by yuyang18 in save_op).
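+// Minimal POSIX-style filesystem helpers used by SaveCombineOp below:
+// FileExists() stats the path, DirName() strips the last '/'-separated
+// component, and MkDirRecursively() creates every missing parent directory
+// (mode 0755) before the output file is opened for writing.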
+constexpr char kSEP = '/'; +static bool FileExists(const std::string &filepath) { + struct stat buffer; + return (stat(filepath.c_str(), &buffer) == 0); +} + +static std::string DirName(const std::string &filepath) { + auto pos = filepath.rfind(kSEP); + if (pos == std::string::npos) { + return ""; + } + return filepath.substr(0, pos); +} + +static void MkDir(const char *path) { + if (mkdir(path, 0755)) { + PADDLE_ENFORCE_EQ(errno, EEXIST, "%s mkdir failed!", path); + } +} + +static void MkDirRecursively(const char *fullpath) { + if (*fullpath == '\0') return; // empty string + if (FileExists(fullpath)) return; + + MkDirRecursively(DirName(fullpath).c_str()); + MkDir(fullpath); +} + +class SaveCombineOp : public framework::OperatorBase { + public: + SaveCombineOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto filename = Attr("file_path"); + auto overwrite = Attr("overwrite"); + + bool is_present = FileExists(filename); + if (is_present && !overwrite) { + PADDLE_THROW("%s exists!, cannot save_combine to it when overwrite=false", + filename, overwrite); + } + + MkDirRecursively(DirName(filename).c_str()); + std::ofstream fout(filename); + PADDLE_ENFORCE(static_cast(fout), "Cannot open %s to write", + filename); + + auto inp_var_names = Inputs("X"); + PADDLE_ENFORCE_GT(static_cast(inp_var_names.size()), 0, + "The number of input variables should be greater than 0"); + + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + for (size_t i = 0; i < inp_var_names.size(); i++) { + auto *var = scope.FindVar(inp_var_names[i]); + + PADDLE_ENFORCE(var != nullptr, + "Cannot find variable %s for save_combine_op", + inp_var_names[i]); + PADDLE_ENFORCE(var->IsType(), + "SaveCombineOp only supports LoDTensor, %s has wrong type", + inp_var_names[i]); + + auto &tensor = var->Get(); + // Serialize tensor + framework::SerializeToStream(fout, tensor, dev_ctx); + } + fout.close(); + } +}; + +class SaveCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + SaveCombineOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "X", + "(vector) Input LoDTensors that need to be saved together in a file.") + .AsDuplicable(); + AddComment(R"DOC( +SaveCombine operator + +This operator will serialize and write a list of input LoDTensor variables +to a file on disk. 
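+Each input is written in order with SerializeToStream to the file named by the
+"file_path" attribute; missing parent directories are created first, and an
+existing file is only replaced when the "overwrite" attribute is true.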
+)DOC"); + AddAttr("overwrite", + "(boolean, default true)" + "Overwrite the output file if it exists.") + .SetDefault(true); + AddAttr( + "file_path", + "(string)" + "The \"file_path\" where the LoDTensor variables will be saved.") + .AddCustomChecker( + [](const std::string &path) { return !path.empty(); }); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(save_combine, ops::SaveCombineOp, + ops::SaveCombineOpProtoMaker); diff --git a/paddle/fluid/operators/save_load_combine_op_test.cc b/paddle/fluid/operators/save_load_combine_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f8325bac6bc59602e81d38cb857b7b8e133be2cc --- /dev/null +++ b/paddle/fluid/operators/save_load_combine_op_test.cc @@ -0,0 +1,180 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" + +USE_NO_KERNEL_OP(save_combine); +USE_NO_KERNEL_OP(load_combine); + +int* CreateForSaveCombineOp(int x, int y, const std::vector& lod_info, + std::string var_name, + paddle::platform::CPUPlace& place, + paddle::framework::Scope& scope, + paddle::framework::LoD& expect_lod) { + auto var = scope.Var(var_name); + auto tensor = var->GetMutable(); + tensor->Resize({x, y}); + expect_lod.resize(1); + for (size_t i = 0; i < lod_info.size(); i++) { + expect_lod[0].push_back(lod_info[i]); + } + tensor->set_lod(expect_lod); + int* expect = tensor->mutable_data(place); + for (int64_t i = 0; i < tensor->numel(); ++i) { + expect[i] = static_cast(i); + } + return expect; +} + +paddle::framework::LoDTensor* GeneratePlaceholderBeforeLoad( + const std::string out_var_name, paddle::framework::Scope& scope) { + auto load_var = scope.Var(out_var_name); + auto target = load_var->GetMutable(); + return target; +} + +int* GetValuesAfterLoadCombineOp(paddle::framework::LoDTensor* target, + paddle::framework::Scope& scope, + paddle::framework::LoD& actual_lod) { + int* actual = target->data(); + actual_lod = target->lod(); + return actual; +} + +void CheckValues(int* expect, int* actual, paddle::framework::LoD expect_lod, + paddle::framework::LoD actual_lod, const int& numel) { + for (int64_t i = 0; i < numel; ++i) { + EXPECT_EQ(expect[i], actual[i]); + } + EXPECT_EQ(expect_lod.size(), actual_lod.size()); + for (size_t i = 0; i < expect_lod.size(); ++i) { + for (size_t j = 0; j < expect_lod[i].size(); ++j) { + EXPECT_EQ(expect_lod[i][j], actual_lod[i][j]); + } + } +} + +// Here, we create 4 LoDTensors and use save_combine_op to first save these +// in a single file. 
Then, we use load_combine_op to load these sequentially +TEST(SaveLoadCombineOp, CPU) { + paddle::framework::Scope scope; + paddle::platform::CPUPlace place; + + std::vector lod1 = {0, 1, 2, 3, 10}; + int numel1 = 100; + paddle::framework::LoD expect_lod1; + int* expect1 = CreateForSaveCombineOp(10, 10, lod1, "test_var1", place, scope, + expect_lod1); + + std::vector lod2 = {0, 2, 5, 10}; + int numel2 = 200; + paddle::framework::LoD expect_lod2; + int* expect2 = CreateForSaveCombineOp(10, 20, lod2, "test_var2", place, scope, + expect_lod2); + + std::vector lod3 = {0, 2, 3, 20}; + int numel3 = 4000; + paddle::framework::LoD expect_lod3; + int* expect3 = CreateForSaveCombineOp(20, 200, lod3, "test_var3", place, + scope, expect_lod3); + + std::vector lod4 = {0, 1, 20}; + int numel4 = 1000; + paddle::framework::LoD expect_lod4; + int* expect4 = CreateForSaveCombineOp(20, 50, lod4, "test_var4", place, scope, + expect_lod4); + + // Set attributes + std::string filename = "check_tensor.ls"; + paddle::framework::AttributeMap attrs; + attrs.insert({"file_path", std::string(filename)}); + + // Run the save_combine_op + auto save_combine_op = paddle::framework::OpRegistry::CreateOp( + "save_combine", + {{"X", {"test_var1", "test_var2", "test_var3", "test_var4"}}}, {}, attrs); + save_combine_op->Run(scope, place); + + // Set up output vars + auto target1 = GeneratePlaceholderBeforeLoad("out_var1", scope); + auto target2 = GeneratePlaceholderBeforeLoad("out_var2", scope); + auto target3 = GeneratePlaceholderBeforeLoad("out_var3", scope); + auto target4 = GeneratePlaceholderBeforeLoad("out_var4", scope); + + // Run the load_combine_op + auto load_combine_op = paddle::framework::OpRegistry::CreateOp( + "load_combine", {}, + {{"Out", {"out_var1", "out_var2", "out_var3", "out_var4"}}}, attrs); + load_combine_op->Run(scope, place); + + paddle::framework::LoD actual_lod1, actual_lod2, actual_lod3, actual_lod4; + int* actual1 = GetValuesAfterLoadCombineOp(target1, scope, actual_lod1); + int* actual2 = GetValuesAfterLoadCombineOp(target2, scope, actual_lod2); + int* actual3 = GetValuesAfterLoadCombineOp(target3, scope, actual_lod3); + int* actual4 = GetValuesAfterLoadCombineOp(target4, scope, actual_lod4); + + CheckValues(expect1, actual1, expect_lod1, actual_lod1, numel1); + CheckValues(expect2, actual2, expect_lod2, actual_lod2, numel2); + CheckValues(expect3, actual3, expect_lod3, actual_lod3, numel3); + CheckValues(expect4, actual4, expect_lod4, actual_lod4, numel4); +} + +// Test with original SaveLoadTest +TEST(SaveLoadTestWithCombineOp, CPU) { + paddle::framework::Scope scope; + paddle::platform::CPUPlace place; + + auto var = scope.Var("test_var"); + auto tensor = var->GetMutable(); + tensor->Resize({3, 10}); + paddle::framework::LoD expect_lod; + expect_lod.resize(1); + expect_lod[0].push_back(0); + expect_lod[0].push_back(1); + expect_lod[0].push_back(2); + expect_lod[0].push_back(3); + + tensor->set_lod(expect_lod); + int* expect = tensor->mutable_data(place); + for (int64_t i = 0; i < tensor->numel(); ++i) { + expect[i] = static_cast(i); + } + paddle::framework::AttributeMap attrs; + attrs.insert({"file_path", std::string("check_t.save")}); + + auto save_op = paddle::framework::OpRegistry::CreateOp( + "save_combine", {{"X", {"test_var"}}}, {}, attrs); + save_op->Run(scope, place); + + auto load_var = scope.Var("out_var"); + auto target = load_var->GetMutable(); + auto load_op = paddle::framework::OpRegistry::CreateOp( + "load_combine", {}, {{"Out", {"out_var"}}}, attrs); + load_op->Run(scope, 
place); + int* actual = target->data(); + for (int64_t i = 0; i < tensor->numel(); ++i) { + EXPECT_EQ(expect[i], actual[i]); + } + auto& actual_lod = target->lod(); + EXPECT_EQ(expect_lod.size(), actual_lod.size()); + for (size_t i = 0; i < expect_lod.size(); ++i) { + for (size_t j = 0; j < expect_lod[i].size(); ++j) { + EXPECT_EQ(expect_lod[i][j], actual_lod[i][j]); + } + } +} diff --git a/paddle/fluid/operators/save_load_op_test.cc b/paddle/fluid/operators/save_load_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..da4573a8ed936cf607123590ca41fb8f630930f3 --- /dev/null +++ b/paddle/fluid/operators/save_load_op_test.cc @@ -0,0 +1,63 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" + +USE_NO_KERNEL_OP(save); +USE_NO_KERNEL_OP(load); + +TEST(SaveLoadOp, CPU) { + paddle::framework::Scope scope; + paddle::platform::CPUPlace place; + + auto var = scope.Var("test_var"); + auto tensor = var->GetMutable(); + tensor->Resize({3, 10}); + paddle::framework::LoD expect_lod; + expect_lod.resize(1); + expect_lod[0].push_back(0); + expect_lod[0].push_back(1); + expect_lod[0].push_back(2); + expect_lod[0].push_back(3); + + tensor->set_lod(expect_lod); + int* expect = tensor->mutable_data(place); + for (int64_t i = 0; i < tensor->numel(); ++i) { + expect[i] = static_cast(i); + } + paddle::framework::AttributeMap attrs; + attrs.insert({"file_path", std::string("tensor.save")}); + + auto save_op = paddle::framework::OpRegistry::CreateOp( + "save", {{"X", {"test_var"}}}, {}, attrs); + save_op->Run(scope, place); + + auto load_var = scope.Var("out_var"); + auto target = load_var->GetMutable(); + auto load_op = paddle::framework::OpRegistry::CreateOp( + "load", {}, {{"Out", {"out_var"}}}, attrs); + load_op->Run(scope, place); + int* actual = target->data(); + for (int64_t i = 0; i < tensor->numel(); ++i) { + EXPECT_EQ(expect[i], actual[i]); + } + auto& actual_lod = target->lod(); + EXPECT_EQ(expect_lod.size(), actual_lod.size()); + for (size_t i = 0; i < expect_lod.size(); ++i) { + for (size_t j = 0; j < expect_lod[i].size(); ++j) { + EXPECT_EQ(expect_lod[i][j], actual_lod[i][j]); + } + } +} diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..483cdfa4c3b9e3b9abd3f32bc5e6e5e0b493bd23 --- /dev/null +++ b/paddle/fluid/operators/save_op.cc @@ -0,0 +1,128 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +// TODO(yuyang18): If the functions below are needed by other files, move them +// to paddle::filesystem namespace. +constexpr char kSEP = '/'; +static bool FileExists(const std::string &filepath) { + struct stat buffer; + return (stat(filepath.c_str(), &buffer) == 0); +} + +static std::string DirName(const std::string &filepath) { + auto pos = filepath.rfind(kSEP); + if (pos == std::string::npos) { + return ""; + } + return filepath.substr(0, pos); +} + +static void MkDir(const char *path) { + if (mkdir(path, 0755)) { + PADDLE_ENFORCE_EQ(errno, EEXIST, "%s mkdir failed!", path); + } +} + +static void MkDirRecursively(const char *fullpath) { + if (*fullpath == '\0') return; // empty string + if (FileExists(fullpath)) return; + + MkDirRecursively(DirName(fullpath).c_str()); + MkDir(fullpath); +} + +class SaveOp : public framework::OperatorBase { + public: + SaveOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto filename = Attr("file_path"); + auto overwrite = Attr("overwrite"); + + if (FileExists(filename) && !overwrite) { + PADDLE_THROW("%s is existed, cannot save to it when overwrite=false", + filename, overwrite); + } + + MkDirRecursively(DirName(filename).c_str()); + + // FIXME(yuyang18): We save variable to local file now, but we should change + // it to save an output stream. + std::ofstream fout(filename); + PADDLE_ENFORCE(static_cast(fout), "Cannot open %s to write", + filename); + + auto iname = Input("X"); + auto *var = scope.FindVar(iname); + PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s for save_op", + iname); + + PADDLE_ENFORCE(var->IsType(), + "SaveOp only support LoDTensor, %s has wrong type", iname); + + auto &tensor = var->Get(); + + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + framework::SerializeToStream(fout, tensor, dev_ctx); + } +}; + +class SaveOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + SaveOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor ) Input tensor to be saved"); + AddComment(R"DOC( +Save operator + +This operator will serialize and write a tensor variable to file on disk. 
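+The target path is given by the "file_path" attribute; missing parent
+directories are created, and an existing file is only replaced when the
+"overwrite" attribute is true.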
+)DOC"); + AddAttr("overwrite", + "(boolean, default true)" + "Overwrite the output file if exist") + .SetDefault(true); + AddAttr("file_path", + "(string)" + "The \"file_path\" where the variable will be saved.") + .AddCustomChecker( + [](const std::string &path) { return !path.empty(); }); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(save, ops::SaveOp, ops::SaveOpProtoMaker); diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..017fc2c00e4016052179acfe328cdda42d6f84de --- /dev/null +++ b/paddle/fluid/operators/scale_op.cc @@ -0,0 +1,82 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/scale_op.h" +#include "paddle/fluid/operators/net_op.h" + +namespace paddle { +namespace operators { + +class ScaleOp : public framework::OperatorWithKernel { + public: + ScaleOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ScaleOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ScaleOp should not be null."); + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +template +class ScaleOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor) Input tensor of scale operator."); + AddOutput("Out", "(Tensor) Output tensor of scale operator."); + AddComment(R"DOC( +Scale operator + +$$Out = scale*X$$ +)DOC"); + AddAttr("scale", + "(float, default 1.0)" + "The scaling factor of the scale operator.") + .SetDefault(1.0); + } +}; + +class ScaleGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("scale"); + grad_op->SetInput("X", OutputGrad("Out")); + grad_op->SetOutput("Out", InputGrad("X")); + grad_op->SetAttr("scale", GetAttr("scale")); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker, + ops::ScaleGradMaker); +REGISTER_OP_CPU_KERNEL( + scale, ops::ScaleKernel, + ops::ScaleKernel, + ops::ScaleKernel, + ops::ScaleKernel); diff --git a/paddle/fluid/operators/scale_op.cu b/paddle/fluid/operators/scale_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..a9b46077aa07406fef2cba5b18d190501ce2f92a --- /dev/null +++ 
b/paddle/fluid/operators/scale_op.cu @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/scale_op.h" + +REGISTER_OP_CUDA_KERNEL( + scale, + paddle::operators::ScaleKernel, + paddle::operators::ScaleKernel, + paddle::operators::ScaleKernel, + paddle::operators::ScaleKernel); diff --git a/paddle/fluid/operators/scale_op.h b/paddle/fluid/operators/scale_op.h new file mode 100644 index 0000000000000000000000000000000000000000..b1c2964ca6385dc6fb81f61a3e5bb042f5d7019f --- /dev/null +++ b/paddle/fluid/operators/scale_op.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { +template +class ScaleKernel : public framework::OpKernel { + public: + virtual void Compute(const framework::ExecutionContext& context) const { + auto* tensor = context.Output("Out"); + auto* in = context.Input("X"); + tensor->mutable_data(in->place()); + + auto scale = static_cast(context.Attr("scale")); + + auto eigen_out = framework::EigenVector::Flatten(*tensor); + auto eigen_in = framework::EigenVector::Flatten(*in); + auto& dev = + *context.template device_context().eigen_device(); + eigen_out.device(dev) = scale * eigen_in; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/scatter.cu.h b/paddle/fluid/operators/scatter.cu.h new file mode 100644 index 0000000000000000000000000000000000000000..0f1b9426a745ac293bd756da6ee750119879429e --- /dev/null +++ b/paddle/fluid/operators/scatter.cu.h @@ -0,0 +1,80 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +template +__global__ void ScatterCUDAKernel(const T* params, const int* indices, + T* output, size_t index_size, + size_t slice_size) { + CUDA_1D_KERNEL_LOOP(i, index_size * slice_size) { + int indices_i = i / slice_size; + int slice_i = i - indices_i * slice_size; // offset inside the slice + int scatter_i = indices[indices_i]; + int out_i = scatter_i * slice_size + slice_i; + *(output + out_i) = *(params + i); + } +} + +/** + * A thin wrapper on gpu tensor + * Return a new updated tensor from source tensor, scatter-assigned according to + * index + * input[src]: type-T source Tensor + * input[index]: type-int index Tensor (1-D) + * return: output tensor + */ +template +void GPUScatterAssign(const platform::DeviceContext& ctx, const Tensor& src, + const Tensor& index, Tensor* output) { + // PADDLE_ENFORCE(platform::is_gpu_place(place)); + // check index of shape 1-D + PADDLE_ENFORCE(index.dims().size() == 1); + int index_size = index.dims()[0]; + + auto src_dims = src.dims(); + framework::DDim output_dims(src_dims); + output_dims[0] = index_size; + + // slice size + int slice_size = 1; + for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i]; + + const T* p_src = src.data(); + const int* p_index = index.data(); + T* p_output = output->data(); + + int block = 512; + int n = slice_size * index_size; + int grid = (n + block - 1) / block; + + ScatterCUDAKernel<<< + grid, block, 0, + reinterpret_cast(ctx).stream()>>>( + p_src, p_index, p_output, index_size, slice_size); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/scatter.h b/paddle/fluid/operators/scatter.h new file mode 100644 index 0000000000000000000000000000000000000000..70cae1286caf10323e8e424853f1dc14f84b110c --- /dev/null +++ b/paddle/fluid/operators/scatter.h @@ -0,0 +1,67 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include + +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +/** + * Return a updated tensor from source tensor, scattered according to index: + * dst[i] = src[index[i]] + * input[src]: type-T source Tensor + * input[index]: type-int index Tensor (1-D) + * return: output tensor + */ +template +void ScatterAssign(const platform::DeviceContext& ctx, const Tensor& src, + const Tensor& index, Tensor* output) { + PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace())); + // check index of shape 1-D + PADDLE_ENFORCE(index.dims().size() == 1); + int index_size = index.dims()[0]; + + auto src_dims = src.dims(); + auto dst_dims = output->dims(); + + const T* p_src = src.data(); + const int* p_index = index.data(); + T* p_output = output->data(); + + // check src shape and dst shape should match + for (int i = 1; i < src_dims.size(); i++) + PADDLE_ENFORCE(src_dims[i] == dst_dims[i]); + + // slice size + size_t slice_size = 1; + for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i]; + + const size_t slice_bytes = slice_size * sizeof(T); + + for (int i = 0; i < index_size; ++i) { + int index_ = p_index[i]; + memcpy(p_output + index_ * slice_size, p_src + i * slice_size, slice_bytes); + } +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e35930af53463e18e5ecca3cf41b91ed58a7c4c2 --- /dev/null +++ b/paddle/fluid/operators/scatter_op.cc @@ -0,0 +1,109 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/scatter_op.h" +#include "paddle/fluid/framework/ddim.h" + +namespace paddle { +namespace operators { + +class ScatterOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Ref"), + "Input(Ref) of ScatterOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Index"), + "Input(Index) of ScatterOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Updates"), + "Input(Updates) of ScatterOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ScatterOp should not be null."); + + auto updates_dims = ctx->GetInputDim("Updates"); + auto ref_dims = ctx->GetInputDim("Ref"); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Index").size(), 1, + "Update Index should be 1-D."); + PADDLE_ENFORCE_EQ(ref_dims.size(), updates_dims.size(), + "Reference and Updates should have the same shape size"); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Updates")[0], + ctx->GetInputDim("Index")[0], + "Updates and Index should have same batch-size."); + framework::DDim data_dim(updates_dims); + for (int i = 1; i < data_dim.size(); ++i) { + PADDLE_ENFORCE_EQ(data_dim[i], updates_dims[i]); + } + ctx->SetOutputDim("Out", ref_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Ref")->type()), + ctx.device_context()); + } +}; + +class ScatterGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + ctx->SetOutputDim(framework::GradVarName("Updates"), + ctx->GetInputDim("Updates")); + ctx->SetOutputDim(framework::GradVarName("Ref"), ctx->GetInputDim("Ref")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Ref")->type()), + ctx.device_context()); + } +}; + +class ScatterOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ScatterOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Ref", "The source input of scatter op"); + AddInput("Index", + "The index input of scatter op where Ref will be updated"); + AddInput("Updates", "The updated value of updates op"); + AddOutput("Out", "The output of add op"); + AddComment(R"DOC( +Scatter Operator. + +This operator obtains output by updating the input on selected indices on the first axis: + +$$ +Out = Ref \\ +Out[Index] = Ref[Index] + Updates +$$ + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(scatter, ops::ScatterOp, ops::ScatterOpMaker, scatter_grad, + ops::ScatterGradOp); +REGISTER_OP_CPU_KERNEL(scatter, ops::ScatterOpKernel); +REGISTER_OP_CPU_KERNEL(scatter_grad, ops::ScatterGradientOpKernel); diff --git a/paddle/fluid/operators/scatter_op.cu b/paddle/fluid/operators/scatter_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..f9eaae33a802ed1a45184a24757e3883fad5e639 --- /dev/null +++ b/paddle/fluid/operators/scatter_op.cu @@ -0,0 +1,63 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "gather.cu.h" +#include "paddle/fluid/operators/gather_op.h" +#include "scatter.cu.h" + +namespace paddle { +namespace operators { + +template +class ScatterOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "This kernel only runs on GPU device."); + auto *Ref = ctx.Input("Ref"); + auto *Index = ctx.Input("Index"); + auto *Updates = ctx.Input("Updates"); + auto *Out = ctx.Output("Out"); + + Out->ShareDataWith(*Ref); + + GPUScatterAssign(ctx.device_context(), *Updates, *Index, Out); + } +}; + +template +class ScatterGradOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "This kernel only runs on GPU device."); + auto *dRef = ctx.Output(framework::GradVarName("Ref")); + auto *dUpdates = ctx.Output(framework::GradVarName("Updates")); + auto *Index = ctx.Input("Index"); + auto *dOut = ctx.Input(framework::GradVarName("Out")); + + // In place gradient: dRef = dO + dRef->ShareDataWith(*dOut); + dUpdates->mutable_data(ctx.GetPlace()); + // Gradient by Gather: dUpdates = dO[Index] + GPUGather(ctx.device_context(), *dOut, *Index, dUpdates); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(scatter, ops::ScatterOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(scatter_grad, ops::ScatterGradOpCUDAKernel); diff --git a/paddle/fluid/operators/scatter_op.h b/paddle/fluid/operators/scatter_op.h new file mode 100644 index 0000000000000000000000000000000000000000..65d10546328780e09bb57876acf2326d98803847 --- /dev/null +++ b/paddle/fluid/operators/scatter_op.h @@ -0,0 +1,64 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "gather.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "scatter.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class ScatterOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), + "This kernel only runs on CPU."); + auto *Ref = ctx.Input("Ref"); + auto *Index = ctx.Input("Index"); + auto *Updates = ctx.Input("Updates"); + auto *Out = ctx.Output("Out"); + + // In place output: Out = Ref, Out[Index] += Updates + Out->ShareDataWith(*Ref); + // Apply ScatterUpdate: Out[index] += Updates[:] + ScatterAssign(ctx.device_context(), *Updates, *Index, Out); + } +}; + +template +class ScatterGradientOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), + "This kernel only runs on CPU."); + auto *dRef = ctx.Output(framework::GradVarName("Ref")); + auto *dUpdates = ctx.Output(framework::GradVarName("Updates")); + auto *Index = ctx.Input("Index"); + auto *dOut = ctx.Input(framework::GradVarName("Out")); + + // In place gradient: dRef = dO + dRef->ShareDataWith(*dOut); + dUpdates->mutable_data(ctx.GetPlace()); + // Gradient by Gather: dUpdates += dO[Index] + CPUGather(ctx.device_context(), *dOut, *Index, dUpdates); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/scatter_test.cc b/paddle/fluid/operators/scatter_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..8fb5ef96af34e5bd2dc0802ea76456a8b47749ab --- /dev/null +++ b/paddle/fluid/operators/scatter_test.cc @@ -0,0 +1,58 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/scatter.h" +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/place.h" + +#include +#include +#include + +TEST(scatter, ScatterUpdate) { + using namespace paddle::framework; + using namespace paddle::platform; + using namespace paddle::operators; + + Tensor* src = new Tensor(); + Tensor* index = new Tensor(); + Tensor* output = new Tensor(); + + float* p_src = nullptr; + int* p_index = nullptr; + p_src = src->mutable_data(make_ddim({1, 4}), CPUPlace()); + p_index = index->mutable_data(make_ddim({1}), CPUPlace()); + + for (size_t i = 0; i < 4; ++i) p_src[i] = float(i); + p_index[0] = 1; + + float* p_output = output->mutable_data(make_ddim({4, 4}), CPUPlace()); + + auto* cpu_place = new paddle::platform::CPUPlace(); + paddle::platform::CPUDeviceContext ctx(*cpu_place); + ScatterAssign(ctx, *src, *index, output); + + for (size_t i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], float(0)); + for (size_t i = 0; i < 4; ++i) EXPECT_EQ(output->data()[i], float(0)); + for (size_t i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], float(i - 4)); + for (size_t i = 4; i < 8; ++i) + EXPECT_EQ(output->data()[i], float(i - 4)); + for (size_t i = 8; i < 16; ++i) EXPECT_EQ(p_output[i], float(0)); + for (size_t i = 8; i < 16; ++i) EXPECT_EQ(output->data()[i], float(0)); + + delete src; + delete index; + delete output; +} diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a8390aa6596c69f85e3ef736dda9dd99c3fd6dba --- /dev/null +++ b/paddle/fluid/operators/send_op.cc @@ -0,0 +1,109 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +#include +#include "paddle/fluid/operators/detail/grpc_client.h" + +namespace paddle { +namespace operators { + +class SendOp : public framework::OperatorBase { + public: + SendOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope& scope, + const platform::Place& place) const override { + auto ins = Inputs("X"); + auto outs = Outputs("Out"); + std::vector epmap = Attr>("epmap"); + std::vector endpoints = + Attr>("endpoints"); + + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(place); + + auto client_var_name = Output("RPCClient"); + PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name), + "Can not find variable '%s' in the scope.", + client_var_name); + auto* client_var = scope.FindVar(client_var_name); + detail::RPCClient* rpc_client = client_var->GetMutable(); + + for (size_t i = 0; i < ins.size(); i++) { + VLOG(3) << "sending " << ins[i] << " to " << epmap[i]; + rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]); + } + PADDLE_ENFORCE(rpc_client->Wait()); + + for (auto& ep : endpoints) { + VLOG(3) << "batch barrier, ep: " << ep; + rpc_client->AsyncSendBatchBarrier(ep); + } + PADDLE_ENFORCE(rpc_client->Wait()); + + if (outs.size() > 0) { + for (size_t i = 0; i < outs.size(); i++) { + VLOG(3) << "getting " << outs[i] << " from " << epmap[i]; + rpc_client->AsyncGetVariable(epmap[i], ctx, scope, outs[i]); + } + PADDLE_ENFORCE(rpc_client->Wait()); + } + } +}; + +class SendOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SendOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor) Input tensor to be sent").AsDuplicable(); + AddOutput("Out", "(Tensor) Output tensor to be received from server") + .AsDuplicable(); + AddOutput("RPCClient", + "(RPCClient) The RPC client object which is" + "initialized at most once."); + AddComment(R"DOC( +Send operator + +This operator will send tensor to recv_op at the parameter server. +)DOC"); + // TODO(typhoonzero): remove this attr generate de-duplicated vector from + // epmap when initializing. + AddAttr>("endpoints", + "(string vector, default 127.0.0.1:6164)" + "Server endpoints to send variables to.") + .SetDefault({}); + AddAttr>("epmap", + "(string vector, default 127.0.0.1:6164)" + "Server endpoints in the order of input " + "variables for mapping") + .SetDefault({}); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(send, ops::SendOp, ops::SendOpMaker); diff --git a/paddle/fluid/operators/send_recv_op_test.cc b/paddle/fluid/operators/send_recv_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..716f687044a85d46676141ee125baf398e9e695d --- /dev/null +++ b/paddle/fluid/operators/send_recv_op_test.cc @@ -0,0 +1,207 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/string/printf.h" + +USE_NO_KERNEL_OP(send); +USE_NO_KERNEL_OP(listen_and_serv); +USE_OP(sum); + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +// global for simplicity. +std::unique_ptr listen_and_serv_op; + +void InitTensorsInScope(f::Scope &scope, p::CPUPlace &place) { + p::CPUDeviceContext ctx(place); + for (int i = 0; i < 2; ++i) { + auto var_name = paddle::string::Sprintf("x%d", i); + auto var = scope.Var(var_name); + auto tensor = var->GetMutable(); + tensor->Resize({10, 10}); + float *expect = tensor->mutable_data(place); + for (int64_t i = 0; i < tensor->numel(); ++i) { + expect[i] = static_cast(i); + } + } + + auto out_var = scope.Var("Out"); + auto out_tensor = out_var->GetMutable(); + out_tensor->Resize({10, 10}); + out_tensor->mutable_data(place); // allocate +} + +void InitSelectedRowsInScope(f::Scope &scope, p::CPUPlace &place) { + p::CPUDeviceContext ctx(place); + int64_t height = 10; + int64_t row_numel = 10; + m::SetConstant set_one; + // init x0 + std::vector rows0{0, 4, 7}; + auto x0_var = scope.Var("x0"); + auto x0 = x0_var->GetMutable(); + x0->set_rows(rows0); + x0->set_height(height); + auto x0_value = x0->mutable_value(); + x0_value->mutable_data( + f::make_ddim({static_cast(rows0.size()), row_numel}), place); + set_one(ctx, x0_value, 1.0); + + // init x1 + std::vector rows1{2, 9}; + auto x1_var = scope.Var("x1"); + auto x1 = x1_var->GetMutable(); + x1->set_rows(rows1); + x1->set_height(height); + auto x1_value = x1->mutable_value(); + x1_value->mutable_data( + f::make_ddim({static_cast(rows1.size()), row_numel}), place); + set_one(ctx, x1_value, 1.0); + + auto out_var = scope.Var("Out"); + auto out = out_var->GetMutable(); + auto out_value = out->mutable_value(); + out->set_height(height); + out_value->mutable_data(f::make_ddim({5, 10}), place); +} + +void AddOp(const std::string &type, const f::VariableNameMap &inputs, + const f::VariableNameMap &outputs, f::AttributeMap attrs, + f::BlockDesc *block) { + // insert output + for (auto kv : outputs) { + for (auto v : kv.second) { + auto var = block->Var(v); + var->SetDataType(f::proto::DataType::FP32); + } + } + + // insert op + auto op = block->AppendOp(); + op->SetType(type); + for (auto &kv : inputs) { + op->SetInput(kv.first, kv.second); + } + for (auto &kv : outputs) { + op->SetOutput(kv.first, kv.second); + } + op->SetAttrMap(attrs); +} + +void StartServerNet(bool is_sparse) { + f::Scope scope; + p::CPUPlace place; + if (is_sparse) { + InitSelectedRowsInScope(scope, place); + } else { + InitTensorsInScope(scope, place); + } + + // sub program run in listen_and_serv_op, for simple test we use sum + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + // X for server side tensors, RX for received 
tensers, must be of same shape. + AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, block); + + f::AttributeMap attrs; + attrs.insert({"endpoint", std::string("127.0.0.1:6174")}); + attrs.insert({"ParamList", std::vector({"Out"})}); + attrs.insert({"GradList", std::vector({"x1"})}); + attrs.insert({"OptimizeBlock", block}); + listen_and_serv_op = + f::OpRegistry::CreateOp("listen_and_serv", {}, {}, attrs); + listen_and_serv_op->Run(scope, place); +} + +TEST(SendRecvOp, CPUDense) { + std::thread server_thread(StartServerNet, false); + sleep(10); // wait server to start + // local net + f::Scope scope; + p::CPUPlace place; + InitTensorsInScope(scope, place); + + f::AttributeMap attrs; + attrs.insert({"endpoints", std::vector({"127.0.0.1:6174"})}); + attrs.insert({"epmap", std::vector({"127.0.0.1:6174"})}); + auto send_op = f::OpRegistry::CreateOp("send", {{"X", {"x1"}}}, + {{"Out", {"Out"}}}, attrs); + send_op->Run(scope, place); + + auto in_var = scope.Var("x1"); + auto tensor = in_var->GetMutable(); + float *expected = tensor->data(); + auto out_var = scope.Var("Out"); + auto target = out_var->GetMutable(); + // x1 * 2 == x0 + EXPECT_NE(target->memory_size(), size_t(0)); + float *actual = target->data(); + for (int64_t i = 0; i < target->numel(); ++i) { + EXPECT_EQ(expected[i] * 2, actual[i]); + } + listen_and_serv_op->Stop(); + server_thread.join(); + listen_and_serv_op.reset(nullptr); +} + +TEST(SendRecvOp, CPUSparse) { + std::thread server_thread(StartServerNet, true); + sleep(3); // wait server to start + // local net + f::Scope scope; + p::CPUPlace place; + p::CPUDeviceContext ctx(place); + InitSelectedRowsInScope(scope, place); + f::AttributeMap attrs; + attrs.insert({"endpoints", std::vector({"127.0.0.1:6174"})}); + attrs.insert({"epmap", std::vector({"127.0.0.1:6174"})}); + auto send_op = f::OpRegistry::CreateOp("send", {{"X", {"x1"}}}, + {{"Out", {"Out"}}}, attrs); + send_op->Run(scope, place); + + auto x0 = scope.Var("x0")->GetMutable(); + auto x1 = scope.Var("x1")->GetMutable(); + auto out = scope.Var("Out")->GetMutable(); + auto actual = out->mutable_value(); + + std::unique_ptr expect{new f::SelectedRows()}; + auto expect_value = expect->mutable_value(); + expect_value->mutable_data(f::make_ddim({5, 10}), place); + + m::SelectedRowsAdd add_functor; + add_functor(ctx, *x0, *x1, expect.get()); + + EXPECT_EQ(actual->numel(), expect_value->numel()); + EXPECT_EQ(out->rows().size(), x0->rows().size() + x1->rows().size()); + + for (int64_t i = 0; i < expect_value->numel(); ++i) { + EXPECT_EQ(expect_value->mutable_data(place)[i], + actual->mutable_data(place)[i]); + } + listen_and_serv_op->Stop(); + server_thread.join(); + listen_and_serv_op.reset(); +} diff --git a/paddle/fluid/operators/sequence_concat_op.cc b/paddle/fluid/operators/sequence_concat_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..4ddf800d85e11aa631255c1b1ec5c12f6e0f221c --- /dev/null +++ b/paddle/fluid/operators/sequence_concat_op.cc @@ -0,0 +1,135 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sequence_concat_op.h" + +namespace paddle { +namespace operators { + +class SequenceConcatOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInputs("X"), + "Inputs(X) of SequenceConcatOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SequenceConcatOp should not be null."); + const size_t level = static_cast(ctx->Attrs().Get("level")); + const size_t axis = static_cast(ctx->Attrs().Get("axis")); + PADDLE_ENFORCE(level == 0UL || level == 1UL, + "The sequence_concat operator only accepts sequence " + "or a nested sequence as its input."); + auto ins_dims = ctx->GetInputsDim("X"); + framework::DDim out_dims = ins_dims[0]; + const size_t n = ins_dims.size(); + for (size_t i = 1; i < n; ++i) { + out_dims[axis] += ins_dims[i][axis]; + } + ctx->SetOutputDim("Out", out_dims); + } +}; + +class SequenceConcatOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SequenceConcatOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(LodTensorArray) Input is a vector of LoDTensor, " + "each of which is a variable-length sequence or nested sequence.") + .AsDuplicable(); + AddOutput("Out", + "(LoDTensor), Variable-length output of " + "sequence_concat Op."); + AddAttr("axis", + "(int, default 0) " + "The axis along which the inputs will be joined. " + "If axis is 0, the inputs will be joined with LoD index.") + .SetDefault(0); + AddAttr("level", + "(int, default 0) " + "The level at which the inputs will be joined. " + "If the level is 0, the inputs will be joined at the nested " + "sequence level. " + "If the level is 1, the inputs will be joined at the " + "sequence level. " + "The level should be less than the level number of inputs.") + .SetDefault(0); + AddComment(R"DOC( +The sequence_concat operator concatenates multiple LoDTensors. +It only supports sequence (LoD Tensor with level number is 1) +or a nested sequence (LoD tensor with level number is 2) as its input. +- Case1: + If the axis is other than 0 (here, axis is 1 and level is 1), + each input should have the same LoD information and the LoD + information of the output keeps the same as the input. + + LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4) + LoD(x1) = {{0,2,4}, {0,1,2,3,4}}; Dims(x1) = (4,4,4) + LoD(Out) = {{0,2,4}, {0,1,2,3,4}}; Dims(Out) = (4,7,4) + +- Case2: + If the axis is 0 (here, level is 0), the inputs are concatenated along + time steps, and the LoD information of the output needs to be re-computed. + The LoD information of level-1 should be the same. + + LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4) + LoD(x1) = {{0,2,4}, {0,1,3,5,7}}; Dims(x1) = (7,3,4) + LoD(Out) = {{0,2,4}, {0,2,5,8,11}}; Dims(Out) = (11,3,4) + +- Case3: + If the axis is 0 (here, level is 1). + + LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4) + LoD(x1) = {{0,3,4}, {0,1,3,5,7}}; Dims(x1) = (7,3,4) + LoD(Out) = {{0,5,8}, {0,1,2,3,5,7,8,9,11}}; Dims(Out) = (11,3,4) + +- Case4: + If the LoD number is 1, axis is 0, and level is 0. + + LoD(x0) = {{0,1,2,3,4}}; Dims(x0) = (4,3,4) + LoD(x1) = {{0,1,3,5,7}}; Dims(x1) = (7,3,4) + LoD(Out) = {{0,2,5,8,11}}; Dims(Out) = (11,3,4) + +NOTE: The levels of all the inputs should be the same.
+ )DOC"); + } +}; + +class SequenceConcatGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "The gradient of Out should not be null."); + PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName("X")), + "The gradient of X should not be null."); + ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_EX(sequence_concat, ops::SequenceConcatOp, + ops::SequenceConcatOpMaker, sequence_concat_grad, + ops::SequenceConcatGradOp, false); +REGISTER_OP_CPU_KERNEL( + sequence_concat, + ops::SequenceConcatOpKernel); +REGISTER_OP_CPU_KERNEL( + sequence_concat_grad, + ops::SequenceConcatGradOpKernel); diff --git a/paddle/fluid/operators/sequence_concat_op.cu.cc b/paddle/fluid/operators/sequence_concat_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..c5a280ef9e2515114f5dd6826e55a304066973aa --- /dev/null +++ b/paddle/fluid/operators/sequence_concat_op.cu.cc @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sequence_concat_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + sequence_concat, + ops::SequenceConcatOpKernel); +REGISTER_OP_CUDA_KERNEL(sequence_concat_grad, + ops::SequenceConcatGradOpKernel< + paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/fluid/operators/sequence_concat_op.h b/paddle/fluid/operators/sequence_concat_op.h new file mode 100644 index 0000000000000000000000000000000000000000..9121196369f1bee20abc56a33b9da8bc4a43f315 --- /dev/null +++ b/paddle/fluid/operators/sequence_concat_op.h @@ -0,0 +1,172 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/strided_memcpy.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using LoD = framework::LoD; + +template +LoD ConcatLoD(const std::vector ins, const size_t level) { + auto out_lod = ins[0]->lod(); + auto numLevels = ins[0]->NumLevels(); + const size_t n = ins.size(); + const size_t level_idx = ins[0]->NumLevels() - 1 - level; + for (size_t i = 1; i < n; ++i) { + for (size_t j = 0; j < ins[i]->lod()[level_idx].size(); ++j) { + out_lod[level_idx][j] += ins[i]->lod()[level_idx][j]; + } + } + + for (size_t i = level_idx; i < numLevels - 1; ++i) { + size_t lod_len = 1; + for (size_t j = 0; j < n; ++j) { + lod_len += ins[j]->lod()[i + 1].size() - 1; + } + out_lod[i + 1].clear(); + out_lod[i + 1].resize(lod_len); + + size_t idx = 1; + for (size_t j = 0; j < ins[0]->lod()[i].size() - 1; ++j) { + for (size_t k = 0; k < n; ++k) { + for (size_t m = ins[k]->lod()[i][j]; m < ins[k]->lod()[i][j + 1]; ++m) { + out_lod[i + 1][idx] = out_lod[i + 1][idx - 1] + + ins[k]->lod()[i + 1][m + 1] - + ins[k]->lod()[i + 1][m]; + idx++; + } + } + } + } + + return out_lod; +} + +template +class SequenceConcatOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto ins = ctx.MultiInput("X"); + auto* out = ctx.Output("Out"); + const size_t axis = static_cast(ctx.Attr("axis")); + const size_t level = static_cast(ctx.Attr("level")); + const size_t n = ins.size(); + + for (size_t i = 1; i < n; ++i) { + PADDLE_ENFORCE_EQ(ins[0]->NumLevels(), ins[i]->NumLevels(), + "The levels of all the input LoDTensors " + "should be the same."); + PADDLE_ENFORCE_EQ(ins[0]->dims().size(), ins[i]->dims().size(), + "The dimension size of all the input LoDTensors " + "should be the same."); + + const size_t dims_size = ins[i]->dims().size(); + for (size_t j = 0; j < dims_size; ++j) { + if (j == axis) continue; + PADDLE_ENFORCE_EQ(ins[0]->dims()[j], ins[i]->dims()[j], + "Except for the dimension of the specified " + "axis along which all the inputs are concatenated, " + "dimensions of all the other axises of the input " + "LoDTensors should be the same."); + } + } + PADDLE_ENFORCE_GT(ins[0]->NumLevels(), level, + "The levels of all the input LoDTensors " + "should be greater than the specify level"); + + out->mutable_data(ctx.GetPlace()); + auto out_lod = ins[0]->lod(); + if (axis == 0) { + out_lod = ConcatLoD(ins, level); + } + out->set_lod(out_lod); + + const size_t level_idx = out_lod.size() - level - 1; + auto out_lod_level = framework::ToAbsOffset(out_lod)[level_idx]; + for (size_t i = 0; i < out_lod_level.size() - 1; ++i) { + Tensor out_t = out->Slice(static_cast(out_lod_level[i]), + static_cast(out_lod_level[i + 1])); + auto out_stride = framework::stride(out_t.dims()); + size_t offset = 0; + for (size_t j = 0; j < n; ++j) { + auto in_lod_level = framework::ToAbsOffset(ins[j]->lod())[level_idx]; + auto in_stride = framework::stride(ins[j]->dims()); + Tensor in_t = ins[j]->Slice(static_cast(in_lod_level[i]), + static_cast(in_lod_level[i + 1])); + size_t axis_dim = in_t.dims()[axis]; + StridedMemcpy(ctx.device_context(), in_t.data(), in_stride, + in_t.dims(), out_stride, out_t.data() + offset); + offset += axis_dim * in_stride[axis]; + } + } + } +}; + +template +class SequenceConcatGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) 
const override { + auto ins = ctx.MultiInput("X"); + auto* out_grad = + ctx.Input(framework::GradVarName("Out")); + auto x_grads = + ctx.MultiOutput(framework::GradVarName("X")); + size_t axis = static_cast(ctx.Attr("axis")); + size_t level = static_cast(ctx.Attr("level")); + const size_t n = x_grads.size(); + + // Set Grad(X) LoD as X + for (size_t i = 0; i < n; i++) { + x_grads[i]->set_lod(ins[i]->lod()); + x_grads[i]->mutable_data(ctx.GetPlace()); + } + auto out_lod = ins[0]->lod(); + if (axis == 0UL) { + out_lod = ConcatLoD(ins, level); + } + const size_t level_idx = out_lod.size() - level - 1; + auto out_lod_level = framework::ToAbsOffset(out_lod)[level_idx]; + + for (size_t i = 0; i < out_lod_level.size() - 1; ++i) { + Tensor out_grad_t = + out_grad->Slice(static_cast(out_lod_level[i]), + static_cast(out_lod_level[i + 1])); + auto out_grad_stride = framework::stride(out_grad_t.dims()); + size_t offset = 0; + + for (size_t j = 0; j < n; ++j) { + auto x_grad_lod_level = + framework::ToAbsOffset(x_grads[j]->lod())[level_idx]; + auto x_grad_stride = framework::stride(x_grads[j]->dims()); + Tensor x_grad_t = + x_grads[j]->Slice(static_cast(x_grad_lod_level[i]), + static_cast(x_grad_lod_level[i + 1])); + size_t axis_dim = x_grad_t.dims()[axis]; + StridedMemcpy(ctx.device_context(), out_grad_t.data() + offset, + out_grad_stride, out_grad_t.dims(), x_grad_stride, + x_grad_t.data()); + offset += axis_dim * out_grad_stride[axis]; + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/sequence_conv_op.cc b/paddle/fluid/operators/sequence_conv_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..af9938b18069d65648fbbf0deae31eff088b791f --- /dev/null +++ b/paddle/fluid/operators/sequence_conv_op.cc @@ -0,0 +1,187 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/sequence_conv_op.h" + +namespace paddle { +namespace operators { + +class SequenceConvOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequenceConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Filter"), + "Input(Filter) of SequenceConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SequenceConvOp should not be null."); + + int context_length = ctx->Attrs().Get("contextLength"); + int context_start = ctx->Attrs().Get("contextStart"); + + auto in_dims = ctx->GetInputDim("X"); + auto filter_dims = ctx->GetInputDim("Filter"); + PADDLE_ENFORCE(ctx->Attrs().Get("contextStride") == 1, + "Currently, SequenceConvOp only supports contextStride=1."); + PADDLE_ENFORCE(in_dims.size() == 2 && filter_dims.size() == 2, + "Input(X, Filter) should be 2-D tensor."); + PADDLE_ENFORCE(filter_dims[0] == context_length * in_dims[1], + "Filter's height should be context_length * " + "input_hidden_size ."); + + if (ctx->Attrs().Get("paddingTrainable")) { + PADDLE_ENFORCE( + ctx->HasInput("PaddingData"), + "Input(PaddingData) of SequenceConvOp should not be null."); + framework::DDim padding_dim = ctx->GetInputDim("PaddingData"); + int up_pad = std::max(0, -context_start); + int down_pad = std::max(0, context_start + context_length - 1); + int total_pad = up_pad + down_pad; + int input_width = static_cast(in_dims[1]); + + if (context_start == 0 && context_length == 1) { + PADDLE_THROW( + "If context_start is 0 and context_length is 1, paddingTrainable " + "should be false."); + } + PADDLE_ENFORCE(padding_dim.size() == 2, + "Input(PaddingData) should be 2-D tensor."); + PADDLE_ENFORCE( + padding_dim[0] == total_pad && padding_dim[1] == input_width, + "Input(PaddingData)'s shape is not consistent with 'context_start' " + "and 'context_length'."); + } + + in_dims[1] = filter_dims[1]; + ctx->SetOutputDim("Out", in_dims); + ctx->ShareLoD("X", "Out"); + } +}; + +class SequenceConvGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Gradient of output(Out) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("X"), "The input(X) should not be null."); + + if (ctx->Attrs().Get("paddingTrainable") && + ctx->HasOutput(framework::GradVarName("PaddingData"))) { + ctx->SetOutputDim(framework::GradVarName("PaddingData"), + ctx->GetInputDim("PaddingData")); + } + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD("X", framework::GradVarName("X")); + } + if (ctx->HasOutput(framework::GradVarName("Filter"))) { + ctx->SetOutputDim(framework::GradVarName("Filter"), + ctx->GetInputDim("Filter")); + } + } +}; + +class SequenceConvOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SequenceConvOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "X", + "(LoDTensor) the input(X) is a LodTensor, which supports " + "variable-time length input sequence. 
The underlying tensor in " + "this LoDTensor is a matrix with shape (T, N), where T is the " + "total time steps in this mini-batch and N is the input_hidden_size."); + AddInput("PaddingData", + "(Tensor, optional) the input(PaddingData) is an optional " + "parameter, and it is learnable. " + "This is a tensor with shape (P, N), where P is the " + "top_pad + bottom_pad, N is the input_hidden_size. In order to " + "ensure the equal length of sequence before and after " + "convolution, it is necessary to fill the top and bottom of each " + "sequence according to context_length, context_stride and " + "context_start") + .AsDispensable(); + AddInput( + "Filter", + "(Tensor) the input(Filter) is an learnable parameter." + "This is a tensor with shape (K, M), where K is the " + "context_length * input_hidden_size, M is the output feature size."); + AddOutput( + "Out", + "(LoDTensor) the output(Out) is a LodTensor, which support " + "variable-time length output sequence. The underlying tensor in " + "this LoDTensor is a matrix with shape (T, M), where, T is the " + "total time steps in this mini-batch, M is the output feature size."); + + AddAttr("paddingTrainable", + "(bool, default:false) the padding data of SequenceConvOp " + "is trainable or not.") + .SetDefault(false); + AddAttr("contextLength", + "(int) the contextLength of SequenceConvOp is the " + "height of the convolution kernel.") + .GreaterThan(0); + AddAttr("contextStart", + "(int, default:0) the contextStart of SequenceConvOp " + "represents the beginning of the convolution of the number of " + "rows of sequence, which can be negative. The negative number " + "means to pad contextStart time-steps of zeros or learnable " + "parameters at the beginning of each instance. The positive " + "number means to skip contextStart time-steps of each " + "instance.") + .SetDefault(0); + AddAttr("contextStride", + "(int, default:1) the contextStride of SequenceConvOp " + "represents the stride length of convolution kernel. " + "Currently, SequenceConvOp only supports" + "contextStride=1.") + .SetDefault(1) + .GreaterThan(0); + + AddComment(R"DOC( +Sequence Conv Operator. + +SequenceConvOp performs convolution operation on features of contextLength +time-steps of each instance. The convolution operation calculates the output +based on the input, filter, strides and paddings parameters. +The size of each dimension of the parameters is checked during infer-shape. +In order to ensure the equal length of sequence before and after convolution, +it is necessary to fill the top and bottom of each sequence based on +context_length, context_stride and context_start. + + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(sequence_conv, ops::SequenceConvOp, ops::SequenceConvOpMaker, + sequence_conv_grad, ops::SequenceConvGradOp); + +REGISTER_OP_CPU_KERNEL( + sequence_conv, + ops::SequenceConvKernel, + ops::SequenceConvKernel); +REGISTER_OP_CPU_KERNEL( + sequence_conv_grad, + ops::SequenceConvGradKernel, + ops::SequenceConvGradKernel); diff --git a/paddle/fluid/operators/sequence_conv_op.cu.cc b/paddle/fluid/operators/sequence_conv_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..36f9e8da95d8c963c74fb6c8e75c777b7ba03095 --- /dev/null +++ b/paddle/fluid/operators/sequence_conv_op.cu.cc @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sequence_conv_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + sequence_conv, + ops::SequenceConvKernel, + ops::SequenceConvKernel); +REGISTER_OP_CUDA_KERNEL( + sequence_conv_grad, + ops::SequenceConvGradKernel, + ops::SequenceConvGradKernel); diff --git a/paddle/fluid/operators/sequence_conv_op.h b/paddle/fluid/operators/sequence_conv_op.h new file mode 100644 index 0000000000000000000000000000000000000000..1c81067fea2370458cf6abe8e5465b4c674fbf09 --- /dev/null +++ b/paddle/fluid/operators/sequence_conv_op.h @@ -0,0 +1,160 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/context_project.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +class SequenceConvKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + auto* out = context.Output("Out"); + auto filter = *context.Input("Filter"); + + out->mutable_data(context.GetPlace()); + context.ShareLoD("X", "Out"); + + int context_start = context.Attr("contextStart"); + int context_length = context.Attr("contextLength"); + int context_stride = context.Attr("contextStride"); + bool padding_trainable = context.Attr("paddingTrainable"); + + PADDLE_ENFORCE_EQ(in->lod().size(), 1UL, + "Only support one level sequence now."); + + const Tensor* padding_data = nullptr; + if (padding_trainable) { + padding_data = context.Input("PaddingData"); + } + + int up_pad = std::max(0, -context_start); + int down_pad = std::max(0, context_start + context_length - 1); + int sequence_width = static_cast(in->dims()[1]); + + framework::DDim col_shape = {in->dims()[0], + context_length * sequence_width}; + Tensor col; + col.mutable_data(col_shape, context.GetPlace()); + // Because if padding_trainable is false, padding data should be zeros. 
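+    // The forward pass below first zero-fills `col`, then unfolds each time
+    // step's [context_start, context_start + context_length) window into one
+    // row of `col` via ContextProjectFunctor (an im2col-style expansion over
+    // the sequence dimension), and finally computes Out = col * Filter with a
+    // single GEMM through math::matmul.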
+ math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); + set_zero(dev_ctx, &col, static_cast(0)); + + math::ContextProjectFunctor seq_project_functor; + + seq_project_functor(dev_ctx, *in, *padding_data, padding_trainable, + context_start, context_length, context_stride, up_pad, + down_pad, &col); + + math::matmul(dev_ctx, col, false, filter, false, + static_cast(1.0), out, + static_cast(0.0)); + } +}; + +template +class SequenceConvGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in_g = context.Output(framework::GradVarName("X")); + auto* out_g = context.Input(framework::GradVarName("Out")); + auto* filter_g = context.Output(framework::GradVarName("Filter")); + auto* padding_data_g = + context.Output(framework::GradVarName("PaddingData")); + auto* in = context.Input("X"); + auto* filter = context.Input("Filter"); + + int context_start = context.Attr("contextStart"); + int context_length = context.Attr("contextLength"); + int context_stride = context.Attr("contextStride"); + bool padding_trainable = context.Attr("paddingTrainable"); + + PADDLE_ENFORCE_EQ(in->lod().size(), 1UL, + "Only support one level sequence now."); + auto lod_g_level_0 = in->lod()[0]; + + int up_pad = std::max(0, -context_start); + int down_pad = std::max(0, context_start + context_length - 1); + int sequence_width = static_cast(in->dims()[1]); + + math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); + // use col_shape in the im2col calculation + framework::DDim col_shape = {in->dims()[0], + sequence_width * context_length}; + Tensor col; + + if (in_g || filter_g || (padding_trainable && padding_data_g)) { + col.mutable_data(col_shape, context.GetPlace()); + // Because if padding_trainable is false, padding data should be zeros. 
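+      // The backward pass below recovers the column gradient as
+      // d(col) = d(Out) * Filter^T, scatters it back through the context
+      // projection to produce d(X) and d(PaddingData), and, when the filter
+      // gradient is required, rebuilds the forward `col` so that
+      // d(Filter) = col^T * d(Out) can be accumulated with one GEMM.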
+ set_zero(dev_ctx, &col, static_cast(0)); + math::matmul(dev_ctx, *out_g, false, *filter, true, + T(1.0), &col, T(1.0)); + } + math::ContextProjectFunctor seq_project_functor; + math::ContextProjectGradFunctor seq_project_grad_functor; + + if (in_g) { + in_g->mutable_data(context.GetPlace()); + in_g->set_lod(in->lod()); + set_zero(dev_ctx, in_g, static_cast(0)); + + seq_project_grad_functor(dev_ctx, *in_g, padding_trainable, context_start, + context_length, context_stride, up_pad, down_pad, + false, true, padding_data_g, &col); + } + + if (padding_trainable && padding_data_g) { + padding_data_g->mutable_data(context.GetPlace()); + set_zero(dev_ctx, padding_data_g, static_cast(0)); + + LoDTensor* input = const_cast(in); + seq_project_grad_functor( + dev_ctx, *input, padding_trainable, context_start, context_length, + context_stride, up_pad, down_pad, true, false, padding_data_g, &col); + } + + if (filter_g) { + filter_g->mutable_data(context.GetPlace()); + set_zero(dev_ctx, filter_g, static_cast(0)); + + Tensor filter_grad = *filter_g; + LoDTensor out_grad = *out_g; + + const Tensor* padding_data = nullptr; + if (padding_trainable) { + padding_data = context.Input("PaddingData"); + } + + seq_project_functor(dev_ctx, *in, *padding_data, padding_trainable, + context_start, context_length, context_stride, up_pad, + down_pad, &col); + + math::matmul(dev_ctx, col, true, out_grad, false, + T(1.0), &filter_grad, T(1.0)); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/sequence_erase_op.cc b/paddle/fluid/operators/sequence_erase_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..2e0adf8b1900f7b7c43001459a7e7c494d854274 --- /dev/null +++ b/paddle/fluid/operators/sequence_erase_op.cc @@ -0,0 +1,90 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sequence_erase_op.h" + +namespace paddle { +namespace operators { + +class SequenceEraseOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequenceEraseOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SequenceEraseOp should not be null."); + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE(x_dims.size() == 2 && x_dims[1] == 1, + "Input(X) of SequenceEraseOp should be a 2-D LoDTensor " + "with the 2nd dimension equal to 1."); + ctx->SetOutputDim("Out", x_dims); + } +}; + +class SequenceEraseOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SequenceEraseOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(2-D LoDTensor with the 2nd dim. equal to 1) " + "Input LoDTensor of SequenceEraseOp."); + AddOutput("Out", + "(2-D LoDTensor with the 2nd dim. 
equal to 1) " + "Output LoDTensor of SequenceEraseOp."); + AddAttr>("tokens", + "(vector) Tokens need to be erased from " + "input sequences."); + AddComment(R"DOC( +Sequence Erase Operator. + +Sequence erase operator erases tokens specified by Attr(tokens) from the input +sequences Input(X), and outputs the remaining data and modifies the LoD +information at the same time. For example, given a 2-D LoDTensor + + X = [[2, 2, 6, 1, 3, 9, 6, 1, 0, 1]]^T + +with lod = [[0, 3, 6, 10]], there are three sequences in the input: + + X1 = [[2, 2, 6]]^T, X2 = [[1, 3, 9]]^T and X3 = [[6, 1, 0, 1]]^T. + +If the tokens to be erased are Attr(tokens) = [2, 3, 5], after the erasing +operation, the three sequences become + + X1' = [[6]]^T, X2' = [[1, 9]]^T and X3' = [[6, 1, 0, 1]]^T. + +Hence the LoDTensor Output(Out) should be + + Out = [[6, 1, 9, 6, 1, 0, 1]]^T, + +with lod = [[0, 1, 3, 7]]. + +An example usage for this operator is to remove the special tokens when +computing the edit distance between two strings, such as blank, start token, +and end token. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(sequence_erase, ops::SequenceEraseOp, + ops::SequenceEraseOpMaker); +REGISTER_OP_CPU_KERNEL( + sequence_erase, + ops::SequenceEraseKernel, + ops::SequenceEraseKernel); diff --git a/paddle/fluid/operators/sequence_erase_op.cu b/paddle/fluid/operators/sequence_erase_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..43fc352fe78d03fb54dd90a43e3d37b0646cefce --- /dev/null +++ b/paddle/fluid/operators/sequence_erase_op.cu @@ -0,0 +1,118 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "paddle/fluid/operators/sequence_erase_op.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { +using platform::PADDLE_CUDA_NUM_THREADS; +using LoDTensor = framework::LoDTensor; + +template +__global__ void LabelErasedIdx(const T* in_dat, const int64_t in_len, + const int* tokens, const size_t tokens_len, + size_t* num_erased) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < in_len) { + for (size_t i = 0; i < tokens_len; ++i) { + if (in_dat[index] == tokens[i]) { + num_erased[index + 1] = 1; + break; + } + } + } +} + +__global__ void GetOutLod(const size_t* num_erased, const size_t* in_lod, + const size_t lod_len, size_t* out_lod0) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < lod_len) { + out_lod0[index] = in_lod[index] - num_erased[in_lod[index]]; + } +} + +template +__global__ void SetOutput(const T* in_dat, const int64_t in_len, + const size_t* num_erased, T* out_dat) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < in_len) { + if (num_erased[index] == num_erased[index + 1]) { + out_dat[index - num_erased[index]] = in_dat[index]; + } + } +} + +template +class SequenceEraseOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* out = ctx.Output("Out"); + + auto lod = in->lod(); + PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now."); + PADDLE_ENFORCE_EQ(lod[0].back(), (size_t)in->numel(), + "The actual size mismatches with the LoD information."); + auto tokens = ctx.Attr>("tokens"); + auto in_len = in->numel(); + auto in_dat = in->data(); + // Copy tokens to GPU + thrust::device_vector dev_tokens(tokens.begin(), tokens.end()); + int* dev_tokens_ptr = thrust::raw_pointer_cast(dev_tokens.data()); + + // Count number of elements to be erased + thrust::device_vector num_erased(in_len + 1, 0); + size_t* num_erased_ptr = thrust::raw_pointer_cast(num_erased.data()); + auto stream = ctx.cuda_device_context().stream(); + LabelErasedIdx<<<(in_len - 1) / PADDLE_CUDA_NUM_THREADS + 1, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>( + in_dat, in_len, dev_tokens_ptr, tokens.size(), num_erased_ptr); + thrust::inclusive_scan(num_erased.begin() + 1, num_erased.end(), + num_erased.begin() + 1); + + // Copy LoD to GPU + auto lod0 = lod[0]; + auto lod_len = lod0.size(); + const size_t* dev_in_lod_ptr = lod0.CUDAData(ctx.GetPlace()); + + // Calc output LoD + thrust::device_vector dev_out_lod(lod_len); + size_t* dev_out_lod_ptr = thrust::raw_pointer_cast(dev_out_lod.data()); + GetOutLod<<<(lod_len - 1) / PADDLE_CUDA_NUM_THREADS + 1, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>( + num_erased_ptr, dev_in_lod_ptr, lod_len, dev_out_lod_ptr); + // Set LoD for output + std::vector out_lod0(dev_out_lod.begin(), dev_out_lod.end()); + framework::LoD out_lod; + out_lod.push_back(out_lod0); + out->set_lod(out_lod); + + // Set output + out->Resize({static_cast(out_lod0.back()), 1}); + auto out_dat = out->mutable_data(ctx.GetPlace()); + SetOutput<<<(in_len - 1) / PADDLE_CUDA_NUM_THREADS + 1, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_dat, in_len, + num_erased_ptr, out_dat); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL(sequence_erase, + paddle::operators::SequenceEraseOpCUDAKernel, + paddle::operators::SequenceEraseOpCUDAKernel); diff --git a/paddle/fluid/operators/sequence_erase_op.h b/paddle/fluid/operators/sequence_erase_op.h 
new file mode 100644 index 0000000000000000000000000000000000000000..e151279c7fc20d5e04048080a9432cb723334b75 --- /dev/null +++ b/paddle/fluid/operators/sequence_erase_op.h @@ -0,0 +1,70 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class SequenceEraseKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* out = ctx.Output("Out"); + + auto lod = in->lod(); + PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now."); + PADDLE_ENFORCE_EQ(lod[0].back(), (size_t)in->numel(), + "The actual size mismatches with the LoD information."); + auto tokens = ctx.Attr>("tokens"); + auto in_len = in->numel(); + auto in_dat = in->data(); + auto lod0 = lod[0]; + + std::vector num_erased(in_len + 1, 0); + std::vector out_lod0(1, 0); + for (size_t i = 0; i < lod0.size() - 1; ++i) { + size_t num_out = 0; + for (auto j = lod0[i] + 1; j <= lod0[i + 1]; ++j) { + num_erased[j] = num_erased[j - 1]; + if (std::find(tokens.begin(), tokens.end(), in_dat[j - 1]) != + tokens.end()) { + num_erased[j] += 1; + } else { + num_out += 1; + } + } + out_lod0.push_back(out_lod0.back() + num_out); + } + + auto out_len = in_len - num_erased[in_len]; + out->Resize({static_cast(out_len), 1}); + auto out_dat = out->mutable_data(ctx.GetPlace()); + + for (int64_t i = 0; i < in_len; ++i) { + if (num_erased[i] == num_erased[i + 1]) { + out_dat[i - num_erased[i]] = in_dat[i]; + } + } + framework::LoD out_lod; + out_lod.push_back(out_lod0); + out->set_lod(out_lod); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/sequence_expand_op.cc b/paddle/fluid/operators/sequence_expand_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..4ebce641d2876a4f2329b6d7d7263a6b2a31fcf6 --- /dev/null +++ b/paddle/fluid/operators/sequence_expand_op.cc @@ -0,0 +1,153 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/sequence_expand_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class SequenceExpandOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X")); + PADDLE_ENFORCE(ctx->HasOutput("Out")); + PADDLE_ENFORCE(ctx->HasInput("Y")); + framework::DDim out_dim; + out_dim = ctx->GetInputDim("Y"); + ctx->ShareLoD("Y", "Out"); + ctx->SetOutputDim("Out", out_dim); + } +}; + +class SequenceExpandOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SequenceExpandOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(Tensor or LoDTensor) The input(X) of this operator can be a " + "LoDTensor or a base Tensor."); + AddInput("Y", + "(LoDTensor)The reference input(Y) of sequence_expand op." + "It must be a LoDTensor with k-level(k>0)." + "The input(X) will be expanded according to LOD of input(Y)." + "The element numbers of last level in input(Y) " + "must be equal to dims[0] of input(X)."); + AddOutput("Out", + "(LodTensor)The output of sequence_expand op." + "The lod of output will be as same as input(Y)'s lod."); + AddComment(R"DOC( +Sequence Expand Operator. + +This operator expands input(X) according to LOD of input(Y). +Following are cases to better explain how this works: +Case 1: + +Given a 2-level LoDTensor input(X) + X.lod = [[0, 2, 3], + [0, 1, 3, 4]] + X.data = [a, b, c, d] + X.dims = [4, 1] +and input(Y) + Y.lod = [[0, 2, 4], + [0, 3, 6, 7, 8]] +with condition len(Y.lod[-1]) -1 == X.dims[0] +then we get 2-level LoDTensor + Out.lod = [[0, 2, 4], + [0, 3, 6, 7, 8]] + Out.data = [a, a, a, b, b, b, c, d] + Out.dims = [8, 1] + +Case 2: + +Given a common Tensor input(X) + X.data = [a, b, c] + X.dims = [3, 1] +and input(Y) + Y.lod = [[0, 2, 3, 6]] +with condition len(Y.lod[-1]) -1 == X.dims[0] +then we get 1-level LoDTensor + Out.lod = [[0, 2, 3, 6]] + Out.data = [a, a, b, c, c, c] + Out.dims = [6, 1] + +Case 3: + +Given a common Tensor input(X) + X.data = [[a, b], [c, d], [e, f]] + X.dims = [3, 2] +and input(Y) + Y.lod = [[0, 2, 3, 6]] +with condition len(Y.lod[-1]) -1 == X.dims[0] +then we get 1-level LoDTensor + Out.lod = [[0, 2, 3, 6]] + Out.data = [[a,b], [a,b] [c,d], [e, f], [e, f], [e, f]] + Out.dims = [6, 2] + +Case 4: + +Given 2-level a LoDTensor input(X) + X.lod = [[0, 2, 3], + [0, 1, 3, 4]] + X.data = [a, b, c, d] + X.dims = [4, 1] +and input(Y) + Y.lod = [[0, 2, 4], + [0, 3, 6, 6, 8]] +with condition len(Y.lod[-1]) -1 == X.dims[0] +then we get 2-level LoDTensor + Out.lod = [[0, 2, 4], + [0, 3, 6, 6, 8]] + Out.data = [a, a, a, b, b, b, d, d] + Out.dims = [8, 1] + + +)DOC"); + } +}; + +class SequenceExpandOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X")); + PADDLE_ENFORCE(ctx->HasInput("Out")); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "The input(Out@GRAD) should not be null"); + auto x_dims = ctx->GetInputDim("X"); + auto x_grad_name = framework::GradVarName("X"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; 
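+// The registrations below pair sequence_expand with its gradient operator and
+// add the CPU kernels; the corresponding CUDA kernels are registered in
+// sequence_expand_op.cu.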
+REGISTER_OP(sequence_expand, ops::SequenceExpandOp, ops::SequenceExpandOpMaker, + sequence_expand_grad, ops::SequenceExpandOpGrad); +REGISTER_OP_CPU_KERNEL( + sequence_expand, + ops::SequenceExpandKernel); +REGISTER_OP_CPU_KERNEL( + sequence_expand_grad, + ops::SequenceExpandGradKernel); diff --git a/paddle/fluid/operators/sequence_expand_op.cu b/paddle/fluid/operators/sequence_expand_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..5ac76d83da618680502d0add51ae68ac117ad2aa --- /dev/null +++ b/paddle/fluid/operators/sequence_expand_op.cu @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/sequence_expand_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + sequence_expand, + ops::SequenceExpandKernel); +REGISTER_OP_CUDA_KERNEL( + sequence_expand_grad, + ops::SequenceExpandGradKernel); diff --git a/paddle/fluid/operators/sequence_expand_op.h b/paddle/fluid/operators/sequence_expand_op.h new file mode 100644 index 0000000000000000000000000000000000000000..8010627ff6f5acbf300b0f3f9281e60b4ebfa94e --- /dev/null +++ b/paddle/fluid/operators/sequence_expand_op.h @@ -0,0 +1,104 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memcpy.h" +#include "unsupported/Eigen/CXX11/Tensor" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; + +template +class SequenceExpandKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + const T* x_data = x->data(); + auto x_dims = x->dims(); + auto* y = context.Input("Y"); + PADDLE_ENFORCE(!y->lod().empty(), "y should have lod"); + PADDLE_ENFORCE_EQ(static_cast(x_dims[0]), + y->lod().back().size() - 1, + "The size of last lod level in Input(Y)" + "must be equal to dims[0] of Input(X)."); + out->set_lod(y->lod()); + auto* place = + context.template device_context().eigen_device(); + size_t element_len = framework::product(x_dims) / x_dims[0]; + T* out_data = out->mutable_data(context.GetPlace()); + auto out_starts = out->lod().back(); + + for (size_t i = 0; i < out_starts.size() - 1; i++) { + int scale = out_starts[i + 1] - out_starts[i]; + Eigen::TensorMap< + Eigen::Tensor> + x_t(x_data, 1, element_len); + Eigen::TensorMap> + out_t(out_data, scale, element_len); + Eigen::array cast({{scale, 1}}); + out_t.device(*place) = x_t.broadcast(cast); + x_data += element_len; + out_data += element_len * scale; + } + } +}; + +/* + *Given Grad(Out) + * + * Grad(Out).lod = [[0, 2], + * [0, 3, 6]] + * Grad(Out).data = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6] + * Then + * Grad(X).data = [(0.1 + 0.2 + 0.3), (0.4 + 0.5 + 0.6)] + * = [0.6, 1.5] + * Grad(X).lod = Input(X).lod + * + * */ +template +class SequenceExpandGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* d_out = context.Input(framework::GradVarName("Out")); + auto* x = context.Input("X"); + auto* out = context.Input("Out"); + auto* d_x = context.Output(framework::GradVarName("X")); + auto out_last_level = out->lod().back(); + d_x->set_lod(x->lod()); + const T* d_out_data = d_out->data(); + T* d_x_data = d_x->mutable_data(context.GetPlace()); + size_t element_len = d_out->numel() / d_out->dims()[0]; + for (size_t i = 0; i < out_last_level.size() - 1; ++i) { + size_t repeat = out_last_level[i + 1] - out_last_level[i]; + Eigen::TensorMap< + Eigen::Tensor> + d_out_t(d_out_data, static_cast(repeat), element_len); + Eigen::TensorMap> + d_x_t(d_x_data, static_cast(element_len)); + auto place = + context.template device_context().eigen_device(); + d_x_t.device(*place) = d_out_t.sum(Eigen::array({{0}})); + d_out_data += (repeat * element_len); + d_x_data += element_len; + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/sequence_pool_op.cc b/paddle/fluid/operators/sequence_pool_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..2cfb336b2e0b31ab20182a36d806506b6af4c139 --- /dev/null +++ b/paddle/fluid/operators/sequence_pool_op.cc @@ -0,0 +1,149 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sequence_pool_op.h" + +namespace paddle { +namespace operators { + +class SequencePoolOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequencePoolOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SequencePoolOp should not be null."); + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + if (ctx->Attrs().Get("pooltype") == "MAX") { + PADDLE_ENFORCE(ctx->HasOutput("MaxIndex"), + "Output(MaxIndex) of SequencePoolOp should not be null."); + ctx->SetOutputDim("MaxIndex", ctx->GetInputDim("X")); + } + } +}; + +class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SequencePoolOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(LoDTensor) The variable-length input of SequencePoolOp"); + AddOutput("Out", + "(Tensor) The output of SequencePoolOp does not contain LoD " + "infomation."); + AddOutput("MaxIndex", + "(Tensor) This tensor is used for the sequence max-pooling " + "to record the max indexes.") + .AsIntermediate(); + AddAttr( + "pooltype", + "(string, default 'AVERAGE') the pooling pooltype of SequencePoolOp.") + .SetDefault("AVERAGE") + .InEnum({"AVERAGE", "SUM", "SQRT", "LAST", "FIRST", "MAX"}); + AddComment(R"DOC( +Sequence Pool Operator. + +The SequencePoolOp pools features of all time-steps of each instance. +It supports six pooling types: +1. AVERAGE: $$Out[i] = \frac{\sum_i X_i}{N}$$ +2. SUM: $$Out[i] = \sum_jX_{ij}$$ +3. SQRT: $$Out[i] = \frac{\sum_jX_{ij}}{\sqrt{len(X_i)}}$$ +4. LAST: Out[i] = last instance in i-th sequence X[i] +5. FIRST: Out[i] = first instance in i-th sequence X[i] +6. MAX: $$Out[i] = max(X_i)$$ + +The following example explains how this works: +For a mini-batch of 3 variable-length sentences, +containing 2, 3, and 2 time-steps: + +Assume X is a [7,M,N] LoDTensor, and X->lod()[0] = [0, 2, 5, 7], 7=2+3+2. +Besides, for the sake of simplicity, we assume M=1 and N=1, +and the value of X = [[1, 3], [2, 4, 6], [5, 1]]. + +Thus, Out is a [3,1,1] Tensor without LoD infomation. 
+And for different pooltype, the value of Out is as follows: + +- AVERAGE: [2, 4, 3], where 2=(1+3)/2, 4=(2+4+6)/3, 3=(5+1)/2 +- SUM: [4, 12, 6], where 4=1+3, 12=2+4+6, 6=5+1 +- SQRT: [2.82, 6.93, 4.24], where 2.82=(1+3)/sqrt(2), + 6.93=(2+4+6)/sqrt(3), 4.24=(5+1)/sqrt(2) +- MAX: [3, 6, 5], where 3=max(1,3), 6=max(2,4,6), 5=max(5,1) +- LAST: [3, 6, 1], where 3=last(1,3), 6=last(2,4,6), 1=last(5,1) +- FIRST: [1, 2, 5], where 1=first(1,3), 2=first(2,4,6), 5=first(5,1) + + )DOC"); + } +}; + +class SequencePoolGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Gradient of Out should not be null."); + PADDLE_ENFORCE(ctx->HasInput("X"), "The input X should not be null."); + auto og_dims = ctx->GetInputDim(framework::GradVarName("Out")); + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(og_dims.size(), x_dims.size(), + "The rank of output grad must equal to Input(X)."); + for (int64_t i = 1; i < og_dims.size(); ++i) { + PADDLE_ENFORCE_EQ(og_dims[i], x_dims[i], "The dimension mismatch."); + } + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + ctx->ShareLoD("X", framework::GradVarName("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +class SequencePoolGradOpMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* op_desc_ptr = new framework::OpDesc(); + op_desc_ptr->SetType("sequence_pool_grad"); + op_desc_ptr->SetInput("X", Input("X")); + if (boost::get(GetAttr("pooltype")) == "MAX") { + op_desc_ptr->SetInput("MaxIndex", Output("MaxIndex")); + } + op_desc_ptr->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op_desc_ptr->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op_desc_ptr->SetAttrMap(Attrs()); + return std::unique_ptr(op_desc_ptr); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(sequence_pool, ops::SequencePoolOp, ops::SequencePoolOpMaker, + ops::SequencePoolGradOpMaker); +REGISTER_OPERATOR(sequence_pool_grad, ops::SequencePoolGradOp); +REGISTER_OP_CPU_KERNEL( + sequence_pool, + ops::SequencePoolKernel); +REGISTER_OP_CPU_KERNEL( + sequence_pool_grad, + ops::SequencePoolGradKernel); diff --git a/paddle/fluid/operators/sequence_pool_op.cu b/paddle/fluid/operators/sequence_pool_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..364769c39bd1b94935630eb8c16c0e27787139e1 --- /dev/null +++ b/paddle/fluid/operators/sequence_pool_op.cu @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
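The six pooling types listed above can be reproduced with a small standalone sketch over one-level LoD data. This is plain C++ for illustration only; the Pool helper is a hypothetical name, not part of the operator. It uses the example lod {0, 2, 5, 7} and values {1, 3, 2, 4, 6, 5, 1}.

// Standalone sketch of the six pooling types over one-level LoD data.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <string>
#include <vector>

double Pool(const std::string& type, const double* seq, size_t len) {
  if (type == "LAST") return seq[len - 1];
  if (type == "FIRST") return seq[0];
  if (type == "MAX") return *std::max_element(seq, seq + len);
  double sum = 0.0;
  for (size_t i = 0; i < len; ++i) sum += seq[i];
  if (type == "AVERAGE") return sum / len;
  if (type == "SQRT") return sum / std::sqrt(static_cast<double>(len));
  return sum;  // "SUM"
}

int main() {
  std::vector<double> x = {1, 3, 2, 4, 6, 5, 1};
  std::vector<size_t> lod = {0, 2, 5, 7};  // segments: {1,3}, {2,4,6}, {5,1}
  for (const char* type : {"AVERAGE", "SUM", "SQRT", "LAST", "FIRST", "MAX"}) {
    printf("%-7s:", type);
    for (size_t i = 0; i + 1 < lod.size(); ++i)
      printf(" %.2f", Pool(type, x.data() + lod[i], lod[i + 1] - lod[i]));
    printf("\n");  // e.g. AVERAGE: 2 4 3, SUM: 4 12 6, MAX: 3 6 5
  }
  return 0;
}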
+See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU + +#include "paddle/fluid/operators/sequence_pool_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + sequence_pool, + ops::SequencePoolKernel); +REGISTER_OP_CUDA_KERNEL( + sequence_pool_grad, + ops::SequencePoolGradKernel); diff --git a/paddle/fluid/operators/sequence_pool_op.h b/paddle/fluid/operators/sequence_pool_op.h new file mode 100644 index 0000000000000000000000000000000000000000..7b67e6201ebb04b3fbda3520347c580fd9501098 --- /dev/null +++ b/paddle/fluid/operators/sequence_pool_op.h @@ -0,0 +1,155 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/sequence_pooling.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +template +using EigenVector = framework::EigenVector; +template +using EigenMatrix = framework::EigenMatrix; + +template +class SequencePoolKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + auto* out = context.Output("Out"); + std::string pooltype = context.Attr("pooltype"); + + auto dims = in->dims(); + auto lod = in->lod(); + int64_t w = in->numel() / dims[0]; + + // InferShape by lod + PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now."); + PADDLE_ENFORCE_GE( + dims[0], + /*batch size = */ static_cast(lod[0].size() - 1), + "The first dimension of Input(X) must be large than batch size."); + dims[0] = lod[0].size() - 1; + out->Resize({dims}); + + auto lod_level_0 = lod[0]; + + out->mutable_data(context.GetPlace()); + auto& dev_ctx = context.template device_context(); + if (pooltype == "MAX") { + math::MaxSeqPoolFunctor max_pool; + auto* index = context.Output("MaxIndex"); + index->Resize({dims}); + index->mutable_data(context.GetPlace()); + max_pool(dev_ctx, *in, out, index); + return; + } + + auto& place = + *context.template device_context().eigen_device(); + for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { + Tensor in_t = in->Slice(static_cast(lod_level_0[i]), + static_cast(lod_level_0[i + 1])); + Tensor out_t = out->Slice(i, i + 1); + int64_t h = static_cast(lod_level_0[i + 1] - lod_level_0[i]); + auto in_e = EigenMatrix::From(in_t, framework::make_ddim({h, w})); + auto out_e = EigenVector::Flatten(out_t); + + if (pooltype == "AVERAGE") { + out_e.device(place) = in_e.mean(Eigen::array({{0}})); + } else if (pooltype == "SUM") { + out_e.device(place) = in_e.sum(Eigen::array({{0}})); + } else if (pooltype == "SQRT") { + out_e.device(place) = in_e.sum(Eigen::array({{0}})) / + std::sqrt(static_cast(h)); + } else if (pooltype == "LAST") { + out_e.device(place) = 
in_e.chip(h - 1, 0); + } else if (pooltype == "FIRST") { + out_e.device(place) = in_e.chip(0, 0); + } else { + PADDLE_THROW("unsupported pooling pooltype"); + } + } + } +}; + +template +class SequencePoolGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + auto* out_g = context.Input(framework::GradVarName("Out")); + auto* in_g = context.Output(framework::GradVarName("X")); + std::string pooltype = context.Attr("pooltype"); + + auto dims = in->dims(); + auto lod = in->lod()[0]; + int64_t w = in->numel() / dims[0]; + + in_g->mutable_data(context.GetPlace()); + auto& dev_ctx = context.template device_context(); + + if (pooltype == "MAX") { + math::MaxSeqPoolGradFunctor max_pool_grad; + auto* index = context.Input("MaxIndex"); + max_pool_grad(dev_ctx, *out_g, *index, in_g); + return; + } + + if (pooltype == "LAST" || pooltype == "FIRST") { + // set X@Grad be zero at first when pooltype is LAST/FIRST + math::SetConstant functor; + functor(dev_ctx, in_g, 0); + } + auto& place = + *context.template device_context().eigen_device(); + + for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { + auto in_g_t = + in_g->Slice(static_cast(lod[i]), static_cast(lod[i + 1])); + auto out_g_t = out_g->Slice(i, i + 1); + int64_t h = static_cast(lod[i + 1] - lod[i]); + auto in_g_e = EigenMatrix::From(in_g_t, {h, w}); + auto out_g_e = EigenMatrix::From(out_g_t, {1, w}); + auto out_g_e_v = EigenVector::Flatten(out_g_t); + Eigen::DSizes bcast(h, 1); + + if (pooltype == "AVERAGE") { + in_g_e.device(place) = (out_g_e / static_cast(h)).broadcast(bcast); + } else if (pooltype == "SUM") { + in_g_e.device(place) = (out_g_e).broadcast(bcast); + } else if (pooltype == "SQRT") { + in_g_e.device(place) = + (out_g_e / std::sqrt(static_cast(h))).broadcast(bcast); + } else if (pooltype == "LAST") { + in_g_e.chip(h - 1, 0).device(place) = out_g_e_v; + } else if (pooltype == "FIRST") { + in_g_e.chip(0, 0).device(place) = out_g_e_v; + } else { + PADDLE_THROW("unsupported pooling pooltype"); + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/sequence_reshape_op.cc b/paddle/fluid/operators/sequence_reshape_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c4e42d3eeb5555be693946ccde30ef87f88d0f32 --- /dev/null +++ b/paddle/fluid/operators/sequence_reshape_op.cc @@ -0,0 +1,135 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/sequence_reshape_op.h" +#include "paddle/fluid/framework/ddim.h" + +namespace paddle { +namespace operators { + +class SequenceReshapeOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequenceReshapeOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SequenceReshapeOp should not be null."); + auto x_dims = ctx->GetInputDim("X"); + auto x_numel = product(x_dims); + PADDLE_ENFORCE_EQ(x_dims.size(), 2U, "Rank of Input(X) should be 2."); + int new_dim = ctx->Attrs().Get("new_dim"); + if (ctx->IsRuntime()) { + ctx->SetOutputDim("Out", + {x_numel / new_dim, static_cast(new_dim)}); + } else { + // when compiling, the batch size is undetermined, just set to -1 + ctx->SetOutputDim("Out", {-1, static_cast(new_dim)}); + } + } +}; + +class SequenceReshapeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SequenceReshapeOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(LoDTensor, default LoDTensor) A 2-D LoDTensor with shape " + "being [N, M]."); + AddOutput("Out", + "(LoDTensor, default LoDTensor) A 2-D LoDTensor with " + "shape [T, new_dim] where T is calculated based on X.lod, M and " + "new_dim."); + AddAttr("new_dim", "Sequence dimension of the output LoDTensor."); + AddComment(R"DOC( +Sequence Reshape Operator. + +This operator will rearrange the input sequences. The new dimension is set by +attribute and length of each sequence may change longer or shorter which is +decided by original length, original dimension and new dimension. The following +example will help to illustrate the function of this operator: + +x is a LoDTensor: + x.lod = [[0, 2, 6]] + x.data = [[1, 2], [3, 4], + [5, 6], [7, 8], [9, 10], [11, 12]] + x.dims = [6, 2] + +set new_dim = 4 + +then out is a LoDTensor: + out.lod = [[0, 1, 3]] + out.data = [[1, 2, 3, 4], + [5, 6, 7, 8], [9, 10, 11, 12]] + out.dims = [3, 4] + +Currently, only 1-level LoDTensor is supported and please make sure (original +length * original dimension) can be divided by new_dim with no remainder for +each sequence. 
+ +)DOC"); + } +}; + +class SequenceReshapeGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE( + ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) of SequenceReshapeGradOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequenceReshapeGradOp should not be null."); + + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); + } +}; + +class SequenceReshapeGradOpMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* op_desc_ptr = new framework::OpDesc(); + op_desc_ptr->SetType("sequence_reshape_grad"); + op_desc_ptr->SetInput("X", Input("X")); + op_desc_ptr->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op_desc_ptr->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op_desc_ptr->SetAttrMap(Attrs()); + return std::unique_ptr(op_desc_ptr); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(sequence_reshape, ops::SequenceReshapeOp, + ops::SequenceReshapeOpMaker, ops::SequenceReshapeGradOpMaker); +REGISTER_OPERATOR(sequence_reshape_grad, ops::SequenceReshapeGradOp); +REGISTER_OP_CPU_KERNEL( + sequence_reshape, + ops::SequenceReshapeKernel, + ops::SequenceReshapeKernel, + ops::SequenceReshapeKernel, + ops::SequenceReshapeKernel); +REGISTER_OP_CPU_KERNEL( + sequence_reshape_grad, + ops::SequenceReshapeGradKernel, + ops::SequenceReshapeGradKernel, + ops::SequenceReshapeGradKernel, + ops::SequenceReshapeGradKernel); diff --git a/paddle/fluid/operators/sequence_reshape_op.cu b/paddle/fluid/operators/sequence_reshape_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..5ca3497396eaa6c811c69af4acf4fa3092cff42a --- /dev/null +++ b/paddle/fluid/operators/sequence_reshape_op.cu @@ -0,0 +1,30 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sequence_reshape_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + sequence_reshape, + ops::SequenceReshapeKernel, + ops::SequenceReshapeKernel, + ops::SequenceReshapeKernel, + ops::SequenceReshapeKernel); +REGISTER_OP_CUDA_KERNEL( + sequence_reshape_grad, + ops::SequenceReshapeGradKernel, + ops::SequenceReshapeGradKernel, + ops::SequenceReshapeGradKernel, + ops::SequenceReshapeGradKernel); diff --git a/paddle/fluid/operators/sequence_reshape_op.h b/paddle/fluid/operators/sequence_reshape_op.h new file mode 100644 index 0000000000000000000000000000000000000000..7a5d1261da917c6e596ff7b85afbfd95ff90f12a --- /dev/null +++ b/paddle/fluid/operators/sequence_reshape_op.h @@ -0,0 +1,86 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +template +class SequenceReshapeKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + auto* out = context.Output("Out"); + int out_width = context.Attr("new_dim"); + + auto in_dims = in->dims(); + int64_t in_width = in_dims[1]; + auto& in_lod = in->lod(); + + PADDLE_ENFORCE_EQ(in_lod.size(), 1UL, + "Only support one level sequence now."); + PADDLE_ENFORCE_EQ( + (uint64_t)in_dims[0], in_lod[0].back(), + "Inconsistent size between X.shape[0] and X.lod()[0].back()."); + + auto in_lod_l0 = in_lod[0]; + int seq_num = in_lod_l0.size() - 1; + + if (in_width == out_width) { + out->set_lod(in->lod()); + } else { + auto& out_lod = *out->mutable_lod(); + out_lod.resize(1); + out_lod[0].resize(seq_num + 1); + out_lod[0][0] = 0; + for (int i = 0; i < seq_num; ++i) { + size_t seq_len = in_lod_l0[i + 1] - in_lod_l0[i]; + size_t offset = 0; + offset = (seq_len * in_width) / out_width; + PADDLE_ENFORCE_EQ(offset * out_width, seq_len * in_width, + "Please make sure (sequence_length * dimension) can " + "be divided by new_dim with no remainder for each " + "sequence. The %dth sequence is invalid.", + i + 1); + out_lod[0][i + 1] = out_lod[0][i] + offset; + } + } + + framework::Copy(*in, context.GetPlace(), out); + out->Resize({static_cast(out->lod()[0].back()), out_width}); + } +}; + +template +class SequenceReshapeGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x_tensor_ptr = context.Input("X"); + auto* outg_tensor_ptr = + context.Input(framework::GradVarName("Out")); + auto* xg_tensor_ptr = + context.Output(framework::GradVarName("X")); + + xg_tensor_ptr->mutable_data(context.GetPlace()); + framework::Copy(*outg_tensor_ptr, context.GetPlace(), xg_tensor_ptr); + xg_tensor_ptr->Resize(x_tensor_ptr->dims()); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/sequence_slice_op.cc b/paddle/fluid/operators/sequence_slice_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..87b8eff64621290ebd75d2cb76d7c684655b884f --- /dev/null +++ b/paddle/fluid/operators/sequence_slice_op.cc @@ -0,0 +1,130 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sequence_slice_op.h" + +namespace paddle { +namespace operators { + +class SequenceSliceOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequenceSliceOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Offset"), + "Input(Offset) of SequenceSliceOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Length"), + "Input(Length) of SequenceSliceOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SequenceSliceOp should not be null."); + auto input_dims = ctx->GetInputDim("X"); + + auto offset_dim = ctx->GetInputDim("Offset"); + auto length_dim = ctx->GetInputDim("Length"); + + PADDLE_ENFORCE_EQ( + offset_dim.size(), 2UL, + "Only support one level sequence now, The rank of offset must be 2."); + PADDLE_ENFORCE_EQ( + length_dim.size(), 2UL, + "Only support one level sequence now, The rank of Length must be 2."); + + // Initialize the output's dims to maximum, + // and re-set to real dims by the value of Offset and Length at kernel + ctx->SetOutputDim("Out", input_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +class SequenceSliceGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "The gradient of Out should not be null."); + PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName("X")), + "The gradient of X should not be null."); + ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +class SequenceSliceOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SequenceSliceOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(LoDTensor), " + "the input of SequenceSliceOp."); + AddInput("Offset", + "(Tensor), " + "a vector to describe the offset of every input sequence for " + "sub sequence item."); + AddInput("Length", + "(Tensor), " + "a vector to describe the length of every input sequence for " + "sub sequence item."); + AddOutput("Out", "(LoDTensor), the output of SequenceSliceOp."); + AddComment(R"DOC( +Sequence slice operator + +The operator crops a subsequence from given sequence with given start offset and subsequence length. +It only supports sequence (LoD Tensor with level number is 1). +- Case: + X = [[a1, a2; + b1, b2; + c1, c2] + [d1, d2; + e1, e2]] + LoD(X) = {{0, 3, 5}}; Dims(X) = (5, 2) + Offset = [[0], [1]]; Length = [[2], [1]] + + Out = [[a1, a2; + b1, b2] + [e1, e2]] + LoD(Out) = {{0, 2, 3}}; Dims(Out) = (3, 2) +NOTE: The first dimension size of input, the size of offset and Length, should be equal. The offset start from 0. 
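The worked case above can be traced with a standalone sketch that builds the output LoD and copies the selected rows, mirroring what SequenceSliceOpKernel does on flat data. Illustrative only: one value per row, and the variable names are assumptions.

// Standalone sketch of sequence_slice: crop [offset, offset + length) rows
// from each sequence of a one-level LoD tensor.
#include <cstdio>
#include <vector>

int main() {
  // Case from the comment: LoD(X) = {0, 3, 5}, Offset = {0, 1}, Length = {2, 1}.
  std::vector<char> x = {'a', 'b', 'c', 'd', 'e'};
  std::vector<size_t> lod = {0, 3, 5};
  std::vector<size_t> offset = {0, 1};
  std::vector<size_t> length = {2, 1};

  std::vector<char> out;
  std::vector<size_t> out_lod = {0};
  for (size_t i = 0; i + 1 < lod.size(); ++i) {
    size_t start = lod[i] + offset[i];
    for (size_t j = 0; j < length[i]; ++j) out.push_back(x[start + j]);
    out_lod.push_back(out_lod.back() + length[i]);
  }

  for (char c : out) printf("%c ", c);         // a b e
  printf("| lod:");
  for (size_t v : out_lod) printf(" %zu", v);  // 0 2 3
  printf("\n");
  return 0;
}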
+ )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(sequence_slice, ops::SequenceSliceOp, ops::SequenceSliceOpMaker, + sequence_slice_grad, ops::SequenceSliceGradOp); +REGISTER_OP_CPU_KERNEL( + sequence_slice, + ops::SequenceSliceOpKernel); +REGISTER_OP_CPU_KERNEL( + sequence_slice_grad, + ops::SequenceSliceGradOpKernel); diff --git a/paddle/fluid/operators/sequence_slice_op.cu b/paddle/fluid/operators/sequence_slice_op.cu new file mode 100755 index 0000000000000000000000000000000000000000..041fabdf9a2dc73540ab45e5e86aa1ef71bed4dc --- /dev/null +++ b/paddle/fluid/operators/sequence_slice_op.cu @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sequence_slice_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + sequence_slice, + ops::SequenceSliceOpKernel); +REGISTER_OP_CUDA_KERNEL( + sequence_slice_grad, + ops::SequenceSliceGradOpKernel); diff --git a/paddle/fluid/operators/sequence_slice_op.h b/paddle/fluid/operators/sequence_slice_op.h new file mode 100644 index 0000000000000000000000000000000000000000..65c36a32aa12c628db5c4f0c104d3977e625ad97 --- /dev/null +++ b/paddle/fluid/operators/sequence_slice_op.h @@ -0,0 +1,173 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/strided_memcpy.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using LoD = framework::LoD; + +template +inline LoD SequenceSliceLoD(const T& in, const int64_t* offset_data, + const int64_t* length_data) { + auto out_lod = in.lod(); + size_t lod_offset = 0; + + auto n = in.lod()[0].size() - 1; + out_lod[0][0] = 0; + for (size_t i = 0; i < n; ++i) { + lod_offset += length_data[i]; + out_lod[0][i + 1] = lod_offset; + } + return out_lod; +} + +template +class SequenceSliceOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* offset = ctx.Input("Offset"); + auto* length = ctx.Input("Length"); + auto* out = ctx.Output("Out"); + + auto lod = in->lod(); + auto n = lod[0].size() - 1; + + PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now."); + PADDLE_ENFORCE_EQ( + n, static_cast(length->dims()[0]), + "The size of input-sequence and length-array should be the same"); + PADDLE_ENFORCE_EQ( + n, static_cast(offset->dims()[0]), + "The size of input-sequence and offset-array should be the same"); + + const int64_t* offset_data = offset->data(); + const int64_t* length_data = length->data(); + framework::Tensor offset_cpu; + framework::Tensor length_cpu; + + if (platform::is_gpu_place(ctx.GetPlace())) { + offset_cpu.mutable_data(offset->dims(), platform::CPUPlace()); + framework::Copy(*offset, platform::CPUPlace(), ctx.device_context(), + &offset_cpu); + offset_data = offset_cpu.data(); + + length_cpu.mutable_data(length->dims(), platform::CPUPlace()); + framework::Copy(*length, platform::CPUPlace(), ctx.device_context(), + &length_cpu); + length_data = length_cpu.data(); + } + + for (size_t i = 0; i < n; ++i) { + PADDLE_ENFORCE_LT(0, offset_data[i], + "The offset[%d] must greater than zero.", i); + PADDLE_ENFORCE_LT(0, length_data[i], + "The length[%d] must greater than zero.", i); + PADDLE_ENFORCE_LT(lod[0][i] + offset_data[i] + length_data[i], + lod[0][i + 1], "The target tensor's length overflow."); + } + + out->mutable_data(ctx.GetPlace()); + auto out_lod = SequenceSliceLoD(*in, offset_data, length_data); + auto out_dims = in->dims(); + out_dims[0] = out_lod[0][out_lod[0].size() - 1]; + out->Resize(out_dims); + out->set_lod(out_lod); + + auto in_stride = framework::stride(in->dims()); + auto out_stride = framework::stride(out->dims()); + + size_t out_offset = 0; + for (size_t i = 0; i < n; ++i) { + Tensor in_t = in->Slice( + static_cast(lod[0][i] + offset_data[i]), + static_cast(lod[0][i] + offset_data[i] + length_data[i])); + + StridedMemcpy(ctx.device_context(), in_t.data(), in_stride, + in_t.dims(), out_stride, out->data() + out_offset); + out_offset += length_data[i] * in_stride[0]; + } + } +}; + +template +class SequenceSliceGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* offset = ctx.Input("Offset"); + auto* length = ctx.Input("Length"); + auto* out_grad = + ctx.Input(framework::GradVarName("Out")); + auto* x_grad = + ctx.Output(framework::GradVarName("X")); + + const int64_t* offset_data = offset->data(); + const int64_t* length_data = length->data(); + framework::Tensor offset_cpu; + framework::Tensor length_cpu; + + if 
(platform::is_gpu_place(ctx.GetPlace())) { + offset_cpu.mutable_data(offset->dims(), platform::CPUPlace()); + framework::Copy(*offset, platform::CPUPlace(), ctx.device_context(), + &offset_cpu); + offset_data = offset_cpu.data(); + + length_cpu.mutable_data(length->dims(), platform::CPUPlace()); + framework::Copy(*length, platform::CPUPlace(), ctx.device_context(), + &length_cpu); + length_data = length_cpu.data(); + } + + auto lod = in->lod(); + auto out_lod = out_grad->lod(); + + if (x_grad) { + x_grad->mutable_data(ctx.GetPlace()); + x_grad->set_lod(in->lod()); + math::SetConstant set_zero; + set_zero(ctx.template device_context(), x_grad, + static_cast(0)); + + auto out_grad_stride = framework::stride(out_grad->dims()); + + for (size_t i = 0; i < out_lod[0].size() - 1; ++i) { + Tensor out_grad_t = + out_grad->Slice(static_cast(out_lod[0][i]), + static_cast(out_lod[0][i + 1])); + auto out_grad_stride = framework::stride(out_grad_t.dims()); + + auto x_grad_stride = framework::stride(x_grad->dims()); + + Tensor x_grad_t = x_grad->Slice( + static_cast(lod[0][i] + offset_data[i]), + static_cast(lod[0][i] + offset_data[i] + length_data[i])); + + StridedMemcpy(ctx.device_context(), out_grad_t.data(), + out_grad_stride, out_grad_t.dims(), x_grad_stride, + x_grad_t.data()); + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/sequence_softmax_op.cc b/paddle/fluid/operators/sequence_softmax_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f966b7162077943dd78d601743b3a3e2e103444b --- /dev/null +++ b/paddle/fluid/operators/sequence_softmax_op.cc @@ -0,0 +1,108 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sequence_softmax_op.h" + +namespace paddle { +namespace operators { + +class SequenceSoftmaxOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequenceSoftmaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SequenceSoftmaxOp should not be null."); + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class SequenceSoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SequenceSoftmaxOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(LoDTensor) 1-D or 2-D input LoDTensor with the 2-nd dimension " + "of length 1."); + AddOutput("Out", + "(LoDTensor) 1-D or 2-D output LoDTensor with the 2-nd dimension " + "of length 1."); + AddComment(R"DOC( +Sequence Softmax Operator. + +SequenceSoftmaxOp computes the softmax activation among all time-steps for each +sequence. The dimension of each time-step should be 1. 
Thus, the shape of +input Tensor can be either [N, 1] or [N], where N is the sum of the length +of all sequences. + +The algorithm works as follows: + + for i-th sequence in a mini-batch: + +$$ +Out(X[lod[i]:lod[i+1]], :) = \ +\frac{\exp(X[lod[i]:lod[i+1], :])} \ +{\sum(\exp(X[lod[i]:lod[i+1], :]))} +$$ + +For example, for a mini-batch of 3 sequences with variable-length, +each containing 2, 3, 2 time-steps, the lod of which is [0, 2, 5, 7], +then softmax will be computed among X[0:2, :], X[2:5, :], X[5:7, :] +and N turns out to be 7. + +)DOC"); + } +}; + +class SequenceSoftmaxGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Out"), + "Input(Out) of SequenceSoftmaxGradOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) of SequenceSoftmaxGradOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequenceSoftmaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Output(X@GRAD) of SequenceSoftmaxOp should not be null."); + + PADDLE_ENFORCE_EQ( + ctx->GetInputDim("Out"), + ctx->GetInputDim(framework::GradVarName("Out")), + "Input(Out) and Input(Out@GRAD) of SequenceSoftmaxGradOp should be of " + "the same shape."); + + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(sequence_softmax, ops::SequenceSoftmaxOp, + ops::SequenceSoftmaxOpMaker, sequence_softmax_grad, + ops::SequenceSoftmaxGradOp); +REGISTER_OP_CPU_KERNEL( + sequence_softmax, + ops::SequenceSoftmaxKernel); +REGISTER_OP_CPU_KERNEL( + sequence_softmax_grad, + ops::SequenceSoftmaxGradKernel); diff --git a/paddle/fluid/operators/sequence_softmax_op.cu.cc b/paddle/fluid/operators/sequence_softmax_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..c42dfd7540954616eb7bf012160a98211c3caf1b --- /dev/null +++ b/paddle/fluid/operators/sequence_softmax_op.cu.cc @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sequence_softmax_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + sequence_softmax, + ops::SequenceSoftmaxKernel) +REGISTER_OP_CUDA_KERNEL( + sequence_softmax_grad, + ops::SequenceSoftmaxGradKernel); diff --git a/paddle/fluid/operators/sequence_softmax_op.h b/paddle/fluid/operators/sequence_softmax_op.h new file mode 100644 index 0000000000000000000000000000000000000000..e6c21c67b3362835b2ff87045a213b2636556346 --- /dev/null +++ b/paddle/fluid/operators/sequence_softmax_op.h @@ -0,0 +1,95 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
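A standalone sketch of the per-sequence softmax described above, applied to the lod {0, 2, 5, 7} example. Plain C++ for illustration only; subtracting the segment maximum is the usual numerical-stability trick and is an assumption of this sketch, not a claim about math::SoftmaxFunctor.

// Standalone sketch of sequence_softmax: softmax is taken within each
// sequence (LoD segment) rather than across the whole mini-batch.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  std::vector<double> x = {1, 2, 3, 4, 5, 6, 7};  // N = 7, one value per time-step
  std::vector<size_t> lod = {0, 2, 5, 7};         // three sequences: 2, 3, 2 steps
  std::vector<double> out(x.size());

  for (size_t i = 0; i + 1 < lod.size(); ++i) {
    double max_v = x[lod[i]];
    for (size_t j = lod[i]; j < lod[i + 1]; ++j) max_v = std::max(max_v, x[j]);
    double denom = 0.0;
    for (size_t j = lod[i]; j < lod[i + 1]; ++j) denom += std::exp(x[j] - max_v);
    for (size_t j = lod[i]; j < lod[i + 1]; ++j)
      out[j] = std::exp(x[j] - max_v) / denom;
  }

  for (double v : out) printf("%.3f ", v);  // each segment sums to 1
  printf("\n");
  return 0;
}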
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/softmax.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +class SequenceSoftmaxKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + + auto lod = x->lod(); + auto dims = x->dims(); + + const size_t level = lod.size() - 1; + PADDLE_ENFORCE_EQ(dims[0], static_cast(lod[level].back()), + "The first dimension of Input(X) should be equal to the " + "sum of all sequences' lengths."); + PADDLE_ENFORCE_EQ(dims[0], x->numel(), + "The width of each timestep in Input(X) of " + "SequenceSoftmaxOp should be 1."); + + out->mutable_data(ctx.GetPlace()); + for (int i = 0; i < static_cast(lod[level].size()) - 1; ++i) { + int start_pos = static_cast(lod[level][i]); + int end_pos = static_cast(lod[level][i + 1]); + Tensor x_i = x->Slice(start_pos, end_pos); + Tensor out_i = out->Slice(start_pos, end_pos); + + // Reshape from (end_pos - start_pos) x 1UL to 1UL x (end_pos - start_pos) + framework::DDim dims_i = framework::make_ddim({1UL, end_pos - start_pos}); + x_i.Resize(dims_i); + out_i.Resize(dims_i); + math::SoftmaxFunctor()( + ctx.template device_context(), &x_i, &out_i); + } + } +}; + +template +class SequenceSoftmaxGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out = ctx.Input("Out"); + auto* out_grad = ctx.Input(framework::GradVarName("Out")); + auto* x = ctx.Input("X"); + auto* x_grad = ctx.Output(framework::GradVarName("X")); + + auto lod = x->lod(); + const size_t level = lod.size() - 1; + + x_grad->mutable_data(ctx.GetPlace()); + for (int i = 0; i < static_cast(lod[level].size()) - 1; ++i) { + int start_pos = static_cast(lod[level][i]); + int end_pos = static_cast(lod[level][i + 1]); + + Tensor out_i = out->Slice(start_pos, end_pos); + Tensor out_grad_i = out_grad->Slice(start_pos, end_pos); + Tensor x_grad_i = x_grad->Slice(start_pos, end_pos); + + // Reshape from (end_pos - start_pos) x 1UL to 1UL x (end_pos - start_pos) + framework::DDim dims_i = framework::make_ddim({1UL, end_pos - start_pos}); + out_i.Resize(dims_i); + out_grad_i.Resize(dims_i); + x_grad_i.Resize(dims_i); + math::SoftmaxGradFunctor()( + ctx.template device_context(), &out_i, &out_grad_i, + &x_grad_i); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/sgd_op.cc b/paddle/fluid/operators/sgd_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f1e23a62f4ec52b40cfa1febc98fbfb045f45efd --- /dev/null +++ b/paddle/fluid/operators/sgd_op.cc @@ -0,0 +1,69 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sgd_op.h" + +namespace paddle { +namespace operators { + +class SGDOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Param"), + "Input(Param) of SGDOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(Grad) of SGDOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("LearningRate"), + "Input(LearningRate) of SGDOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + "Output(ParamOut) of SGDOp should not be null."); + + auto lr_dims = ctx->GetInputDim("LearningRate"); + PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, + "Learning rate should have 1 element"); + auto param_dim = ctx->GetInputDim("Param"); + // TODO(qijun): check dimensions of Param and Grad at complie + // and run time. + ctx->SetOutputDim("ParamOut", param_dim); + } +}; + +class SGDOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SGDOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Param", "(Tensor) Input parameter"); + AddInput("LearningRate", "(Tensor) Learning rate of SGD"); + AddInput("Grad", "(Tensor) Input gradient"); + AddOutput("ParamOut", "(Tensor) Output parameter"); + AddComment(R"DOC( + +SGD operator + +This operator implements one step of the stochastic gradient descent algorithm. + +$$param\_out = param - learning\_rate * grad$$ + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(sgd, ops::SGDOp, ops::SGDOpMaker); +REGISTER_OP_CPU_KERNEL(sgd, ops::SGDOpKernel, ops::SGDOpKernel); diff --git a/paddle/fluid/operators/sgd_op.cu b/paddle/fluid/operators/sgd_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..09374e20494be2eebba913bd90a7c32e1aa0015b --- /dev/null +++ b/paddle/fluid/operators/sgd_op.cu @@ -0,0 +1,118 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/sgd_op.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { + +namespace { + +template +__global__ void SGDKernel(const T* g, const T* p, const T* learning_rate, + const int num, T* p_out) { + T lr = learning_rate[0]; + int grid_size = blockDim.x * gridDim.x; + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; i += grid_size) { + T g_data = g[i]; + T p_data = p[i]; + p_out[i] = p_data - lr * g_data; + } +} + +template +__global__ void SparseSGDFunctorKernel(const T* selected_rows, + const int64_t* rows, + const T* learning_rate, T* tensor_out, + int64_t row_numel) { + const int ty = blockIdx.y; + int tid = threadIdx.x; + + selected_rows += ty * row_numel; + tensor_out += rows[ty] * row_numel; + + for (int index = tid; index < row_numel; index += block_size) { + // Since index in rows of SelectedRows can be duplicate, we have to use + // Atomic Operation to avoid concurrent write error. + paddle::platform::CudaAtomicAdd( + tensor_out + index, -1.0 * learning_rate[0] * selected_rows[index]); + } +} +} // namespace + +template +class SGDOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* param = ctx.Input("Param"); + auto* param_out = ctx.Output("ParamOut"); + auto* learning_rate = ctx.Input("LearningRate"); + + auto* grad_var = ctx.InputVar("Grad"); + // Actually, all tensors are LoDTensor except SelectedRows. + if (grad_var->IsType()) { + param_out->mutable_data(ctx.GetPlace()); + auto* grad = ctx.Input("Grad"); + auto* grad_data = grad->data(); + auto* param_data = param->data(); + auto* param_out_data = param_out->data(); + + int block = 512; + int grid = (param->numel() + block - 1) / block; + + SGDKernel<<>>( + grad_data, param_data, learning_rate->data(), param->numel(), + param_out_data); + + } else if (grad_var->IsType()) { + // TODO(qijun): In Sparse SGD operator, in-place update is enforced. + // This manual optimization brings difficulty to track data dependency. + // It's better to find a more elegant solution. + PADDLE_ENFORCE_EQ(param, param_out); + auto* grad = ctx.Input("Grad"); + + auto in_height = grad->height(); + auto out_dims = param_out->dims(); + PADDLE_ENFORCE_EQ(in_height, out_dims[0]); + + auto& in_value = grad->value(); + framework::Vector in_rows(grad->rows()); + + int64_t in_row_numel = in_value.numel() / in_rows.size(); + PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height); + + auto* in_data = in_value.data(); + auto* out_data = param_out->data(); + + const int block_size = 256; + dim3 threads(block_size, 1); + dim3 grid(1, in_rows.size()); + SparseSGDFunctorKernel< + T, 256><<>>( + in_data, in_rows.CUDAData(ctx.GetPlace()), learning_rate->data(), + out_data, in_row_numel); + + } else { + PADDLE_THROW("Unsupported Variable Type of Grad"); + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(sgd, ops::SGDOpCUDAKernel, + ops::SGDOpCUDAKernel); diff --git a/paddle/fluid/operators/sgd_op.h b/paddle/fluid/operators/sgd_op.h new file mode 100644 index 0000000000000000000000000000000000000000..f1eaaecdb1eef1b42ea5d3b7315133c665b50df6 --- /dev/null +++ b/paddle/fluid/operators/sgd_op.h @@ -0,0 +1,76 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/selected_rows.h" + +namespace paddle { +namespace operators { + +template +class SGDOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* param = ctx.Input("Param"); + auto* param_out = ctx.Output("ParamOut"); + auto* learning_rate = ctx.Input("LearningRate"); + + auto* grad_var = ctx.InputVar("Grad"); + // Actually, all tensors are LoDTensor except SelectedRows. + if (grad_var->IsType()) { + param_out->mutable_data(ctx.GetPlace()); + auto* grad = ctx.Input("Grad"); + + auto p = framework::EigenVector::Flatten(*param); + auto g = framework::EigenVector::Flatten(*grad); + auto o = framework::EigenVector::Flatten(*param_out); + auto* lr = learning_rate->data(); + + o = p - lr[0] * g; + } else if (grad_var->IsType()) { + // TODO(qijun): In Sparse SGD operator, in-place update is enforced. + // This manual optimization brings difficulty to track data dependency. + // It's better to find a more elegant solution. + PADDLE_ENFORCE_EQ(param, param_out); + auto* grad = ctx.Input("Grad"); + + auto in_height = grad->height(); + auto out_dims = param_out->dims(); + PADDLE_ENFORCE_EQ(in_height, out_dims[0]); + + auto& in_value = grad->value(); + auto& in_rows = grad->rows(); + + int64_t in_row_numel = in_value.numel() / in_rows.size(); + PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height); + + auto* in_data = in_value.data(); + auto* out_data = param_out->data(); + auto* lr = learning_rate->data(); + + for (size_t i = 0; i < in_rows.size(); i++) { + for (int64_t j = 0; j < in_row_numel; j++) { + out_data[in_rows[i] * in_row_numel + j] -= + lr[0] * in_data[i * in_row_numel + j]; + } + } + } else { + PADDLE_THROW("Unsupported Variable Type of Grad"); + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/shrink_rnn_memory_op.cc b/paddle/fluid/operators/shrink_rnn_memory_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..df50a324fde1637f1f9f64a0b0d4eff8ba3f26d2 --- /dev/null +++ b/paddle/fluid/operators/shrink_rnn_memory_op.cc @@ -0,0 +1,180 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
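The dense update $$param\_out = param - learning\_rate * grad$$ and the SelectedRows branch of SGDOpKernel above can be sketched on flat arrays as follows. Illustrative only: DenseSGD and SparseSGD are hypothetical helpers, and, as in the kernel, the sparse update is applied in place to the rows listed in `rows`.

// Standalone sketch of the SGD updates: dense over the whole parameter,
// and sparse (SelectedRows-style) over selected rows only.
#include <cstdint>
#include <cstdio>
#include <vector>

void DenseSGD(std::vector<double>* param, const std::vector<double>& grad,
              double lr) {
  for (size_t i = 0; i < param->size(); ++i) (*param)[i] -= lr * grad[i];
}

void SparseSGD(std::vector<double>* param, const std::vector<int64_t>& rows,
               const std::vector<double>& values, int64_t row_numel, double lr) {
  for (size_t i = 0; i < rows.size(); ++i)
    for (int64_t j = 0; j < row_numel; ++j)
      (*param)[rows[i] * row_numel + j] -= lr * values[i * row_numel + j];
}

int main() {
  double lr = 0.1;
  std::vector<double> dense = {1, 1};
  DenseSGD(&dense, {0.5, 0.25}, lr);
  printf("dense : %.3f %.3f\n", dense[0], dense[1]);  // 0.950 0.975

  std::vector<double> param = {1, 1, 1, 1, 1, 1};     // 3 rows x 2 cols
  SparseSGD(&param, /*rows=*/{0, 2}, /*values=*/{1, 2, 3, 4},
            /*row_numel=*/2, lr);
  printf("sparse:");
  for (double v : param) printf(" %.2f", v);          // 0.90 0.80 1.00 1.00 0.70 0.60
  printf("\n");
  return 0;
}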
*/ +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/operators/array_operator.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +class ShrinkRNNMemoryOp : public ArrayOp { + public: + ShrinkRNNMemoryOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : ArrayOp(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto *x_var = scope.FindVar(Input("X")); + PADDLE_ENFORCE(x_var != nullptr, "Input X must be set"); + auto &x_tensor = x_var->Get(); + size_t offset = this->GetOffset(scope, place); + auto *rank_table_var = scope.FindVar(Input("RankTable")); + PADDLE_ENFORCE(rank_table_var != nullptr, "RankTable must be set"); + auto &rank_table = rank_table_var->Get(); + + auto &rank_items = rank_table.items(); + int dst_num_rows = + std::lower_bound(rank_items.begin(), rank_items.end(), offset, + [](const framework::LoDRankTable::TableItem &a, + size_t b) { return a.length > b; }) - + rank_items.begin(); + + auto *out_var = scope.FindVar(Output("Out")); + PADDLE_ENFORCE(out_var != nullptr, "Output(Out) must be set."); + auto &out_tensor = *out_var->GetMutable(); + + size_t height = dst_num_rows; + + // do shrink for the top level LoD + if (x_tensor.lod().size() > 0 && + x_tensor.lod()[0].size() > static_cast(dst_num_rows)) { + auto lod_offset = framework::GetSubLoDAndAbsoluteOffset(x_tensor.lod(), 0, + dst_num_rows, 0); + height = lod_offset.second.second; + auto out_lod = out_tensor.mutable_lod(); + framework::AppendLoD(out_lod, lod_offset.first); + } + + if (dst_num_rows != 0) { + out_tensor.ShareDataWith(x_tensor.Slice(0, height)); + } + } +}; + +class ShrinkRNNMemoryOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + ShrinkRNNMemoryOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(LoDTensor) The RNN step memory to be shrinked."); + AddInput("RankTable", "(LoDRankTable) The lod_rank_table of dynamic RNN."); + AddInput("I", + "(LoDTensor) The step index. The RNN step memory 'X' will be " + "shrinked to match the size of the input of the index'th step."); + AddOutput("Out", "(LoDTensor) The shrinked RNN step memory."); + AddComment(R"DOC( +This operator is used to shrink output batch of memory defined in dynamic RNN. + +Dynamic RNN is able to handle variable-length sequences, in which, sequences in +a mini-batch are sorted by their lengths first. After that, the longest sequence +becomes the first one in the sorted batch, followed by the second longest, the +third longest, and so on. Dynamic RNN then slices a batch input timestep by +timestep from the sorted input. Once any sequence in the input batch reaches its +end, memory defined in dynamicRNN has to shrink its outputs to adapt to the input +batch size for the next time step. 
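The shrinking rule implemented below can be sketched in isolation: with sequence lengths sorted in descending order (as LoDRankTable provides them), a lower_bound using the comparator length > step counts how many sequences still have an element at the current step. The snippet is illustrative only and ShrunkBatchSize is a hypothetical name.

// Standalone sketch of how ShrinkRNNMemoryOp picks the new batch size.
#include <algorithm>
#include <cstdio>
#include <vector>

int ShrunkBatchSize(const std::vector<size_t>& sorted_lengths, size_t step) {
  // First position whose length is <= step; everything before it survives.
  auto it = std::lower_bound(sorted_lengths.begin(), sorted_lengths.end(), step,
                             [](size_t len, size_t s) { return len > s; });
  return static_cast<int>(it - sorted_lengths.begin());
}

int main() {
  // Lengths of the sorted mini-batch: 5, 3, 3, 1.
  std::vector<size_t> lengths = {5, 3, 3, 1};
  for (size_t step = 0; step < 5; ++step)
    printf("step %zu -> batch %d\n", step, ShrunkBatchSize(lengths, step));
  // step 0 -> 4, steps 1-2 -> 3, steps 3-4 -> 1
  return 0;
}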
+)DOC"); + } +}; + +class ShrinkRNNMemoryInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInput("X")); + PADDLE_ENFORCE(context->HasInput("I")); + PADDLE_ENFORCE(context->HasInput("RankTable")); + context->SetOutputDim("Out", context->GetInputDim("X")); + } +}; + +class ShrinkRNNMemoryGradOp : public ArrayOp { + public: + ShrinkRNNMemoryGradOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : ArrayOp(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto *dout_var = scope.FindVar(Input(framework::GradVarName("Out"))); + auto *dx_var = scope.FindVar(Output(framework::GradVarName("X"))); + PADDLE_ENFORCE(dx_var != nullptr, "Input Gradient should not be nullptr"); + auto *x_var = scope.FindVar(Input("X")); + PADDLE_ENFORCE(x_var != nullptr); + + auto &x_tensor = x_var->Get(); + auto &dx_tensor = *dx_var->GetMutable(); + dx_tensor.Resize(x_tensor.dims()); + dx_tensor.mutable_data(x_tensor.place(), x_tensor.type()); + + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + if (dout_var == nullptr) { // dx_tensor fill zero + math::set_constant(dev_ctx, &dx_tensor, 0.0f); + } else { + auto &dout_tensor = dout_var->Get(); + auto height = dout_tensor.dims()[0]; + auto slice = dx_tensor.Slice(0, static_cast(height)); + framework::Copy(dout_tensor, dout_tensor.place(), dev_ctx, &slice); + if (dx_tensor.dims()[0] > height) { + auto rest_tensor = dx_tensor.Slice( + static_cast(height), static_cast(dx_tensor.dims()[0])); + math::set_constant(dev_ctx, &rest_tensor, 0.0f); + } + } + dx_tensor.set_lod(x_tensor.lod()); + } +}; + +class ShrinkRNNMemoryGradInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInput("X")); + PADDLE_ENFORCE(context->HasOutput(framework::GradVarName("X"))); + context->SetOutputDim(framework::GradVarName("X"), + context->GetInputDim("X")); + context->ShareLoD("X", framework::GradVarName("X")); + } +}; + +class ShrinkRNNGradOpMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *op = new framework::OpDesc(); + op->SetType("shrink_rnn_memory_grad"); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return std::unique_ptr(op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(shrink_rnn_memory, ops::ShrinkRNNMemoryOp, + ops::ShrinkRNNMemoryInferShape, + ops::ShrinkRNNMemoryOpProtoMaker, ops::ShrinkRNNGradOpMaker); +REGISTER_OPERATOR(shrink_rnn_memory_grad, ops::ShrinkRNNMemoryGradOp, + ops::ShrinkRNNMemoryGradInferShape); diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3188415a2bd4434704ca95b92427094023527019 --- /dev/null +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc @@ 
-0,0 +1,148 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class SigmoidCrossEntropyWithLogitsOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should be not null."); + + auto x_dims = ctx->GetInputDim("X"); + auto labels_dims = ctx->GetInputDim("Label"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2."); + PADDLE_ENFORCE_EQ(labels_dims.size(), 2, + "Input(Label)'s rank should be 2."); + PADDLE_ENFORCE_EQ(x_dims[0], labels_dims[0], + "The 1st dimension of Input(X) and Input(Label) should " + "be equal."); + PADDLE_ENFORCE_EQ(x_dims[1], labels_dims[1], + "The 2nd dimension of Input(X) and Input(Label) should " + "be equal."); + + ctx->SetOutputDim("Out", x_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class SigmoidCrossEntropyWithLogitsGradOp + : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) shoudl be not null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Output(X@GRAD) should be not null."); + + auto x_dims = ctx->GetInputDim("X"); + auto labels_dims = ctx->GetInputDim("Label"); + auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out")); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2."); + PADDLE_ENFORCE_EQ(labels_dims.size(), 2, + "Input(Label)'s rank should be 2."); + PADDLE_ENFORCE_EQ(dout_dims.size(), 2, + "Input(Out@Grad)'s rank should be 2."); + PADDLE_ENFORCE_EQ(x_dims[0], labels_dims[0], + "The 1st dimension of Input(X) and Input(Label) should " + "be equal."); + PADDLE_ENFORCE_EQ(x_dims[1], labels_dims[1], + "The 2nd dimension of Input(X) and Input(Label) should " + "be equal."); + PADDLE_ENFORCE_EQ(x_dims[0], dout_dims[0], + "The 1st dimension of Input(X) and Input(Out@Grad) " + "should be equal."); + PADDLE_ENFORCE_EQ(x_dims[1], dout_dims[1], + "The 2nd dimension of Input(X) and Input(Out@Grad) " + "should be equal."); + + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + } +}; + +class SigmoidCrossEntropyWithLogitsOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + SigmoidCrossEntropyWithLogitsOpMaker(OpProto* proto, + OpAttrChecker* op_checker) + : framework::OpProtoAndCheckerMaker(proto, 
op_checker) { + AddInput("X", + "(Tensor, default Tensor), a 2-D tensor with shape N x D, " + "where N is the batch size and D is the number of classes. " + "This input is a tensor of logits computed by the previous " + " operator. Logits are unscaled log probabilities given as " + "log(p/(1-p))."); + AddInput("Label", + "(Tensor, default Tensor), a 2-D tensor of the same type " + "and shape as X. This input is a tensor of probabalistic labels " + "for each logit"); + AddOutput("Out", + "(Tensor, default Tensor), a 2-D tensor with shape N x D " + " of elementwise logistic losses."); + AddComment(R"DOC( +SigmoidCrossEntropyWithLogits Operator. + +This measures the element-wise probability error in classification tasks +in which each class is independent. This can be thought of as predicting labels +for a data-point, where labels are not mutually exclusive. +For example, a news article can be about politics, technology or sports +at the same time or none of these. + +The logistic loss is given as follows: + + $$loss = -Labels * \log(\sigma(X)) - (1 - Labels) * \log(1 - \sigma(X))$$ + +We know that $$\sigma(X) = (1 / (1 + \exp(-X)))$$. By substituting this we get: + + $$loss = X - X * Labels + \log(1 + \exp(-X))$$ + +For stability and to prevent overflow of $$\exp(-X)$$ when X < 0, +we reformulate the loss as follows: + + $$loss = \max(X, 0) - X * Labels + \log(1 + \exp(-|X|))$$ + +Both the input `X` and `Labels` can carry the LoD (Level of Details) information. +However the output only shares the LoD with input `X`. + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(sigmoid_cross_entropy_with_logits, + ops::SigmoidCrossEntropyWithLogitsOp, + ops::SigmoidCrossEntropyWithLogitsOpMaker, + sigmoid_cross_entropy_with_logits_grad, + ops::SigmoidCrossEntropyWithLogitsGradOp); +REGISTER_OP_CPU_KERNEL(sigmoid_cross_entropy_with_logits, + ops::SigmoidCrossEntropyWithLogitsKernel< + paddle::platform::CPUDeviceContext, float>); +REGISTER_OP_CPU_KERNEL(sigmoid_cross_entropy_with_logits_grad, + ops::SigmoidCrossEntropyWithLogitsGradKernel< + paddle::platform::CPUDeviceContext, float>); diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..daa9d3e4fa5aeba77f770f69d6057ced98741eaa --- /dev/null +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
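The stability argument in the comment above is easy to check numerically: the naive form overflows once log(1 - sigmoid(X)) leaves floating-point range, while max(X, 0) - X * Labels + log(1 + exp(-|X|)) agrees with it for moderate X and stays finite for extreme X. A small, illustrative NumPy check:

```python
import numpy as np

def sigmoid_ce_naive(x, labels):
    p = 1.0 / (1.0 + np.exp(-x))
    return -labels * np.log(p) - (1.0 - labels) * np.log(1.0 - p)

def sigmoid_ce_stable(x, labels):
    # max(X, 0) - X * Labels + log(1 + exp(-|X|)): never exponentiates a large
    # positive number, so the loss stays finite for any X.
    return np.maximum(x, 0.0) - x * labels + np.log1p(np.exp(-np.abs(x)))

x = np.array([-3.0, -0.5, 0.0, 2.0, 8.0])
labels = np.array([0.0, 1.0, 0.5, 1.0, 0.0])
print(np.allclose(sigmoid_ce_naive(x, labels), sigmoid_ce_stable(x, labels)))  # True
print(sigmoid_ce_stable(np.array([1000.0]), np.array([0.0])))  # [1000.] (finite)
print(sigmoid_ce_naive(np.array([1000.0]), np.array([0.0])))   # [inf] (log(0) underflow)
```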
*/ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(sigmoid_cross_entropy_with_logits, + ops::SigmoidCrossEntropyWithLogitsKernel< + paddle::platform::CUDADeviceContext, float>); +REGISTER_OP_CUDA_KERNEL(sigmoid_cross_entropy_with_logits_grad, + ops::SigmoidCrossEntropyWithLogitsGradKernel< + paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h new file mode 100644 index 0000000000000000000000000000000000000000..977849f7627bc3f5b08a5f34bd300ab1442c6276 --- /dev/null +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h @@ -0,0 +1,74 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +// Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X))) +template +class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + const framework::Tensor *X = context.Input("X"); + const framework::Tensor *Labels = context.Input("Label"); + framework::Tensor *Out = context.Output("Out"); + Out->mutable_data(context.GetPlace()); + + auto x = framework::EigenVector::Flatten(*X); + auto labels = framework::EigenVector::Flatten(*Labels); + auto out = framework::EigenVector::Flatten(*Out); + auto &place = *context.device_context().eigen_device(); + + // term1 = max(x, 0) + auto term1 = x.cwiseMax(static_cast(0)); + // term2 = x * labels + auto term2 = x * labels; + // term3 = log(1 + exp(-abs(x))) + auto term3 = (static_cast(1) + (-(x.abs())).exp()).log(); + + out.device(place) = term1 - term2 + term3; + } +}; + +// dX = sigmoid(X) - labels +template +class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + const framework::Tensor *X = context.Input("X"); + const framework::Tensor *Labels = context.Input("Label"); + const framework::Tensor *dOut = + context.Input(framework::GradVarName("Out")); + framework::Tensor *dX = + context.Output(framework::GradVarName("X")); + dX->mutable_data(context.GetPlace()); + + auto x = framework::EigenVector::Flatten(*X); + auto labels = framework::EigenVector::Flatten(*Labels); + auto dout = framework::EigenVector::Flatten(*dOut); + auto dx = framework::EigenVector::Flatten(*dX); + auto &place = + *context.template device_context().eigen_device(); + + auto sigmoid_x = static_cast(1) / (static_cast(1) + (-x).exp()); + dx.device(place) = dout * (sigmoid_x - labels); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/sign_op.cc b/paddle/fluid/operators/sign_op.cc new file mode 100644 index 
0000000000000000000000000000000000000000..54b962538b8426141c9ab1b9269c0ed8bd5a8496 --- /dev/null +++ b/paddle/fluid/operators/sign_op.cc @@ -0,0 +1,71 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sign_op.h" + +namespace paddle { +namespace operators { + +class SignOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SignOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SignOp should not be null."); + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +template +class SignOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SignOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor) Input tensor of sign operator."); + AddOutput("Out", "(Tensor) Output tensor of sign operator."); + AddComment(R"DOC( +Sign operator + +$$Out = X.sign()$$ +)DOC"); + } +}; + +class SignGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("scale"); + grad_op->SetInput("X", OutputGrad("Out")); + grad_op->SetOutput("Out", InputGrad("X")); + grad_op->SetAttr("scale", 0.0f); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(sign, ops::SignOp, ops::SignOpMaker, + ops::SignGradMaker); +REGISTER_OP_CPU_KERNEL( + sign, ops::SignKernel); diff --git a/paddle/fluid/operators/sign_op.cu b/paddle/fluid/operators/sign_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..93cdb311eb4961a7754f9adfe14a15f3b2d0ca58 --- /dev/null +++ b/paddle/fluid/operators/sign_op.cu @@ -0,0 +1,19 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
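The sign operator above is piecewise constant, so its derivative is zero almost everywhere; the grad maker encodes this by rewriting the backward pass as a `scale` op with scale = 0 instead of adding a dedicated gradient kernel. An illustrative NumPy sketch:

```python
import numpy as np

x = np.array([-2.5, 0.0, 3.1])
out = np.sign(x)            # forward: [-1.,  0.,  1.]

dout = np.ones_like(x)      # incoming gradient from the next op
dx = 0.0 * dout             # what scale(scale=0) produces: [0., 0., 0.]
print(out, dx)
```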
*/ + +#include "paddle/fluid/operators/sign_op.h" + +REGISTER_OP_CUDA_KERNEL( + sign, + paddle::operators::SignKernel); diff --git a/paddle/fluid/operators/sign_op.h b/paddle/fluid/operators/sign_op.h new file mode 100644 index 0000000000000000000000000000000000000000..1c2ebebee40d9b64dd8b658b904e631ba294e41e --- /dev/null +++ b/paddle/fluid/operators/sign_op.h @@ -0,0 +1,39 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { +template +class SignKernel : public framework::OpKernel { + public: + virtual void Compute(const framework::ExecutionContext& context) const { + auto* out = context.Output("Out"); + auto* in = context.Input("X"); + out->mutable_data(in->place()); + + auto eigen_out = framework::EigenVector::Flatten(*out); + auto eigen_in = framework::EigenVector::Flatten(*in); + auto& place = + *context.template device_context().eigen_device(); + eigen_out.device(place) = eigen_in.sign(); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/smooth_l1_loss_op.cc b/paddle/fluid/operators/smooth_l1_loss_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..be4c7a56a84e84c39a578b958fe7c9ad551f54f6 --- /dev/null +++ b/paddle/fluid/operators/smooth_l1_loss_op.cc @@ -0,0 +1,144 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/smooth_l1_loss_op.h" + +namespace paddle { +namespace operators { + +class SmoothL1LossOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + PADDLE_ENFORCE_EQ(x_dims, y_dims); + PADDLE_ENFORCE_GE(x_dims.size(), 2, + "The tensor rank of Input(X) should not be less than 2."); + if (ctx->HasInput("InsideWeight")) { + PADDLE_ENFORCE(ctx->HasInput("OutsideWeight"), + "If weights are provided, must specify both " + "inside and outside weights."); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("InsideWeight"), x_dims); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("OutsideWeight"), x_dims); + } + + ctx->SetOutputDim("Diff", x_dims); + // loss is a two-rank tensor + ctx->SetOutputDim("Out", {x_dims[0], 1}); + } +}; + +template +class SmoothL1LossOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SmoothL1LossOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(Tensor, default Tensor) A tensor with rank at least 2. " + "The input value of smooth l1 loss op with shape " + "[batch_size, dim1, ..., dimN]."); + AddInput("Y", + "(Tensor, default Tensor) A tensor with rank at least 2. " + "The target value of smooth l1 loss op with same shape as X."); + AddInput("InsideWeight", + "(Tensor, default Tensor) A tensor with rank at least 2. " + "This input is optional and should have same shape with X. " + "If provided, the result of (X - Y) will be multiplied " + "by this tensor element by element.") + .AsDispensable(); + AddInput("OutsideWeight", + "(Tensor, default Tensor) A tensor with rank at least 2. " + "This input is optional and should have same shape with X. " + "If provided, the out smooth l1 loss will be multiplied by this " + "tensor element by element.") + .AsDispensable(); + AddOutput("Diff", "Intermediate variable to cache InsideWeight * (X - Y).") + .AsIntermediate(); + AddOutput("Out", + "(Tensor, default Tensor) A tensor with rank be 2. " + "The output smooth l1 loss with shape [batch_size, 1]."); + AddAttr("sigma", + "Hyper parameter of smooth l1 loss op." + "A float scalar with default value 3.0.") + .SetDefault(3.0); + AddComment(R"DOC( +Smooth L1 Loss Operator. + +This operator computes the smooth l1 loss for X and Y. +The operator takes the first dimension of X and Y as batch size. +For each instance, it computes the smooth l1 loss element by element first +and then sums all the losses. So the shape of Out is [batch_size, 1]. + +The equation is: +$$ +Out_{\sigma}(X, Y)_i = \begin{cases} +0.5 * (\sigma * (X_i - Y_i)) ^ 2 +\quad |X_i - Y_i| \lt \frac{1} {{\sigma} ^ 2} \\ +\frac{|X_i - Y_i| - 0.5}{{\sigma}^2}, +\quad otherwise +\end{cases} +$$ + +In the above equation, $Out_{\sigma}(X, Y)_i$, $X_i$ and $Y_i$ represent the ith +element of Out, X and Y. 
+ +)DOC"); + } +}; + +class SmoothL1LossGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + auto in_dims = ctx->GetInputDim("X"); + auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); + + PADDLE_ENFORCE_GE(out_dims.size(), 2, + "The tensor rank of Input(Out@Grad) should be 2."); + PADDLE_ENFORCE_EQ(out_dims[0], in_dims[0], + "The 1st dimension of Input(Out@Grad) must be " + "same as input."); + PADDLE_ENFORCE_EQ(out_dims[1], 1, + "The 2nd dimension of Input(Out@Grad) must be 1."); + + auto x_grad_name = framework::GradVarName("X"); + auto y_grad_name = framework::GradVarName("Y"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, in_dims); + } + if (ctx->HasOutput(y_grad_name)) { + ctx->SetOutputDim(y_grad_name, in_dims); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(smooth_l1_loss, ops::SmoothL1LossOp, + ops::SmoothL1LossOpMaker, smooth_l1_loss_grad, + ops::SmoothL1LossGradOp); +REGISTER_OP_CPU_KERNEL( + smooth_l1_loss, + ops::SmoothL1LossKernel); +REGISTER_OP_CPU_KERNEL( + smooth_l1_loss_grad, + ops::SmoothL1LossGradKernel); diff --git a/paddle/fluid/operators/smooth_l1_loss_op.cu b/paddle/fluid/operators/smooth_l1_loss_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..94c0d6cd299075541f0ef66cbc0bd48a8f4d51b3 --- /dev/null +++ b/paddle/fluid/operators/smooth_l1_loss_op.cu @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU + +#include "paddle/fluid/operators/smooth_l1_loss_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + smooth_l1_loss, + ops::SmoothL1LossKernel); +REGISTER_OP_CUDA_KERNEL( + smooth_l1_loss_grad, + ops::SmoothL1LossGradKernel); diff --git a/paddle/fluid/operators/smooth_l1_loss_op.h b/paddle/fluid/operators/smooth_l1_loss_op.h new file mode 100644 index 0000000000000000000000000000000000000000..325ad824e1874281873b5e41ab62db0fa43040d0 --- /dev/null +++ b/paddle/fluid/operators/smooth_l1_loss_op.h @@ -0,0 +1,184 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; +template +using EigenMatrix = framework::EigenMatrix; + +template +struct SmoothL1LossForward { + HOSTDEVICE SmoothL1LossForward(const T& sigma2) : sigma2(sigma2) {} + + HOSTDEVICE T operator()(const T& val) const { + T abs_val = std::abs(val); + if (abs_val < 1.0 / sigma2) { + return 0.5 * val * val * sigma2; + } else { + return abs_val - 0.5 / sigma2; + } + } + + T sigma2; +}; + +template +class SmoothL1LossKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in0 = context.Input("X"); + auto* in1 = context.Input("Y"); + auto* in2 = context.Input("InsideWeight"); + auto* in3 = context.Input("OutsideWeight"); + auto* out0 = context.Output("Diff"); + auto* out1 = context.Output("Out"); + + out0->mutable_data(context.GetPlace()); + out1->mutable_data(context.GetPlace()); + auto* place = + context.template device_context().eigen_device(); + + auto sigma = static_cast(context.Attr("sigma")); + T sigma2 = sigma * sigma; + bool has_weight = (in2 != nullptr) && (in3 != nullptr); + + auto x = EigenVector::Flatten(*in0); + auto y = EigenVector::Flatten(*in1); + auto diff = EigenVector::Flatten(*out0); + + diff.device(*place) = x - y; + // multiply inside weight + if (has_weight) { + auto inside_weight = EigenVector::Flatten(*in2); + // cache diff, reused in bp + diff.device(*place) = diff * inside_weight; + } + + auto in_counts = in0->numel(); + Tensor ptensor_errors; + ptensor_errors.mutable_data({static_cast(in_counts)}, + context.GetPlace()); + auto errors = EigenVector::Flatten(ptensor_errors); + // apply smooth l1 forward + errors.device(*place) = diff.unaryExpr(SmoothL1LossForward(sigma2)); + + // multiply outside weight + if (has_weight) { + auto outside_weight = EigenVector::Flatten(*in3); + errors.device(*place) = errors * outside_weight; + } + auto loss = EigenVector::Flatten(*out1); + // first dimension of 'X' is the number of samples + auto mat_dims = + framework::make_ddim({static_cast(in0->dims()[0]), + static_cast(in_counts / in0->dims()[0])}); + auto errors_mat_view = EigenMatrix::From(ptensor_errors, mat_dims); + loss.device(*place) = errors_mat_view.sum(Eigen::array({{1}})); + } +}; + +template +struct SmoothL1LossBackward { + HOSTDEVICE SmoothL1LossBackward(const T& sigma2) : sigma2(sigma2) {} + + HOSTDEVICE T operator()(const T& val) const { + T abs_val = std::abs(val); + if (abs_val < 1.0 / sigma2) { + return sigma2 * val; + } else { + return (0 < val) - (val < 0); + } + } + + T sigma2; +}; + +template +class SmoothL1LossGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in0 = context.Input("InsideWeight"); + auto* in1 = context.Input("OutsideWeight"); + auto* in2 = context.Input("Diff"); + auto* og = context.Input(framework::GradVarName("Out")); + auto sigma = static_cast(context.Attr("sigma")); + T sigma2 = sigma * sigma; + bool has_weight = (in0 != nullptr) && (in1 != nullptr); + + auto* place = + context.template device_context().eigen_device(); + + auto in_dims = in2->dims(); + auto counts = in2->numel(); + auto cols = counts / in_dims[0]; + auto mat_dims = framework::make_ddim( + {static_cast(in_dims[0]), 
static_cast(cols)}); + + Tensor ptensor_diff; + ptensor_diff.mutable_data({static_cast(counts)}, + context.GetPlace()); + auto diff = EigenVector::Flatten(ptensor_diff); + // apply smooth l1 backwoard + diff.device(*place) = EigenVector::Flatten(*in2).unaryExpr( + SmoothL1LossBackward(sigma2)); + + // compute weights + Tensor ptensor_weights; + ptensor_weights.mutable_data(mat_dims, context.GetPlace()); + auto weights = EigenMatrix::From(ptensor_weights); + // initialize to 1.0 + weights.device(*place) = weights.constant(static_cast(1.0)); + if (has_weight) { + auto inside_weight = EigenMatrix::From(*in0, mat_dims); + auto outside_weight = EigenMatrix::From(*in1, mat_dims); + weights.device(*place) = inside_weight * outside_weight; + } + + // compute gradients + auto out_grad = EigenMatrix::From(*og); + auto diff_mat_view = EigenMatrix::From(ptensor_diff, mat_dims); + auto gradients = out_grad.broadcast( + Eigen::array({{1, static_cast(cols)}})) * + weights * diff_mat_view; + + auto* out0 = context.Output(framework::GradVarName("X")); + auto* out1 = context.Output(framework::GradVarName("Y")); + + if (out0) { + out0->mutable_data(context.GetPlace()); + auto x_grad = EigenMatrix::From(*out0, mat_dims); + x_grad.device(*place) = gradients; + } + + if (out1) { + out1->mutable_data(context.GetPlace()); + auto y_grad = EigenMatrix::From(*out1, mat_dims); + y_grad.device(*place) = -1 * gradients; + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..1d9462d08b9cc06df2d0dca568dbbe1c50dc948f --- /dev/null +++ b/paddle/fluid/operators/softmax_op.cc @@ -0,0 +1,96 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/softmax_op.h" + +namespace paddle { +namespace operators { + +class SoftmaxOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SoftmaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SoftmaxOp should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE(x_dims.size() == 2UL, + "The input of softmax op must be a matrix."); + ctx->SetOutputDim("Out", x_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SoftmaxOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "The input tensor of softmax. " + "2-D with shape [batch_size, input_feature_dimensions]."); + AddOutput("Out", "The normalized values with the same shape as X."); + AddComment(R"DOC( +Softmax Operator. 
+ +The input of the softmax operator is a 2-D tensor with shape N x K (N is the +batch_size, K is the dimension of input feature). The output tensor has the +same shape as the input tensor. + +For each row of the input tensor, the softmax operator squashes the +K-dimensional vector of arbitrary real values to a K-dimensional vector of real +values in the range [0, 1] that add up to 1. +It computes the exponential of the given dimension and the sum of exponential +values of all the other dimensions in the K-dimensional vector input. +Then the ratio of the exponential of the given dimension and the sum of +exponential values of all the other dimensions is the output of the softmax +operator. + +For each row $i$ and each column $j$ in Input(X), we have: + $$Out[i, j] = \frac{\exp(X[i, j])}{\sum_j(exp(X[i, j])}$$ + +)DOC"); + } +}; + +class SoftmaxOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Out"), "Input(Out) should be not null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should be not null."); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Out"), + ctx->GetInputDim(framework::GradVarName("Out")), + "Input(Out) and its gradients should have a same shape."); + + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker, softmax_grad, + ops::SoftmaxOpGrad); +REGISTER_OP_CPU_KERNEL( + softmax, ops::SoftmaxKernel); +REGISTER_OP_CPU_KERNEL( + softmax_grad, + ops::SoftmaxGradKernel); diff --git a/paddle/fluid/operators/softmax_op.cu.cc b/paddle/fluid/operators/softmax_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..c53d8a2bc82dcfafc178b67299769b2e06109eb3 --- /dev/null +++ b/paddle/fluid/operators/softmax_op.cu.cc @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/softmax_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + softmax, ops::SoftmaxKernel); +REGISTER_OP_CUDA_KERNEL( + softmax_grad, + ops::SoftmaxGradKernel); diff --git a/paddle/fluid/operators/softmax_op.h b/paddle/fluid/operators/softmax_op.h new file mode 100644 index 0000000000000000000000000000000000000000..9287f0231031675b09c941f19c1df1fefc993506 --- /dev/null +++ b/paddle/fluid/operators/softmax_op.h @@ -0,0 +1,56 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
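Written out in NumPy, the row-wise formula above is simply the exponential of each entry divided by the row's sum of exponentials; subtracting the per-row maximum first is a common numerical convenience that cancels in the ratio and is not part of the operator's definition:

```python
import numpy as np

def softmax(x):
    # Subtracting the per-row max leaves the result unchanged but keeps
    # exp() from overflowing for large logits.
    e = np.exp(x - x.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

logits = np.array([[1.0, 2.0, 3.0],
                   [1000.0, 1001.0, 1002.0]])
out = softmax(logits)
print(out)                 # both rows are ~[0.090, 0.245, 0.665]
print(out.sum(axis=1))     # [1., 1.]
```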
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/softmax.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class SoftmaxKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* X = context.Input("X"); + auto* Out = context.Output("Out"); + + // allocate memory on device. + Out->mutable_data(context.GetPlace()); + + math::SoftmaxFunctor()( + context.template device_context(), X, Out); + } +}; + +template +class SoftmaxGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* Out = context.Input("Out"); + auto* dOut = context.Input(framework::GradVarName("Out")); + auto* dX = context.Output(framework::GradVarName("X")); + + // allocate memory on device. + dX->mutable_data(context.GetPlace()); + + math::SoftmaxGradFunctor()( + context.template device_context(), Out, dOut, dX); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..79d56cb97d38ebd725668442c29229ef22f5b05e --- /dev/null +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc @@ -0,0 +1,204 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/softmax_with_cross_entropy_op.h" + +namespace paddle { +namespace operators { + +class SoftmaxWithCrossEntropyOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + SoftmaxWithCrossEntropyOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Logits", + "(Tensor, default: Tensor), The unscaled log probabilities " + "which is a 2-D tensor with shape [N x K]. N is the batch_size, " + "and K is the class number."); + AddInput("Label", + "(Tensor) The ground truth which is a 2-D tensor. If soft_label " + "is set to false, Label is a Tensor with shape [N x 1]. If " + "soft_label is set to true, Label is a Tensor with " + "shape [N x K]."); + AddOutput( + "Softmax", + "(Tensor, default: Tensor), A 2-D tensor with shape [N x K]. " + "The outputs value of softmax activation by given the input batch, " + "which will be used in backward calculation.") + .AsIntermediate(); + AddOutput("Loss", + "(Tensor, default: Tensor), A 2-D tensor. 
The cross " + "entropy loss with shape [N x 1]."); + AddAttr( + "soft_label", + "(bool, default: false), A flag to indicate whether to interpretate " + "the given labels as soft labels.") + .SetDefault(false); + AddComment(R"DOC( +Softmax With Cross Entropy Operator. + +Cross entropy loss with softmax is used as the output layer extensively. This +operator computes the softmax normalized values for each row of the input +tensor, after which cross-entropy loss is computed. This provides a more +numerically stable gradient. + +Because this operator performs a softmax on logits internally, it expects +unscaled logits. This operator should not be used with the output of +softmax operator since that would produce incorrect results. + +When the attribute soft_label is set false, this operators expects mutually +exclusive hard labels, each sample in a batch is in exactly one class with a +probability of 1.0. Each sample in the batch will have a single label. + +The equation is as follows: + +1) Hard label (one-hot label, so every sample has exactly one class) + +$$Loss_j = -\text{Logit}_{Label_j} + +\log\left(\sum_{i=0}^{K}\exp(\text{Logit}_i)\right), +j = 1,..., K$$ + +2) Soft label (each sample can have a distribution over all classes) + +$$Loss_j = -\sum_{i=0}^{K}\text{Label}_i \left(\text{Logit}_i - +\log\left(\sum_{i=0}^{K}\exp(\text{Logit}_i)\right)\right), +j = 1,...,K$$ + +)DOC"); + } +}; + +class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Logits"), + "Input(Logits) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null."); + + PADDLE_ENFORCE(ctx->HasOutput("Softmax"), + "Output(Softmax) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput("Loss"), "Output(Loss) should be not null."); + + auto logits_dims = ctx->GetInputDim("Logits"); + auto labels_dims = ctx->GetInputDim("Label"); + PADDLE_ENFORCE_EQ( + logits_dims.size(), 2UL, + "The input of softmax_with_cross_entropy should be a 2-D tensor."); + PADDLE_ENFORCE_EQ(labels_dims.size(), 2UL, + "The labels should be a 2-D tensor."); + + if (ctx->Attrs().Get("soft_label")) { + PADDLE_ENFORCE_EQ(logits_dims[1], labels_dims[1], + "If Attr(soft_label) == true, the 2nd dimension of " + "Input(X) and Input(Label) should be equal."); + } else { + PADDLE_ENFORCE_EQ(labels_dims[1], 1UL, + "If Attr(soft_label) == false, the 2nd dimension of " + "Input(Label) should be 1."); + } + + ctx->SetOutputDim("Softmax", logits_dims); + ctx->SetOutputDim("Loss", {logits_dims[0], 1}); + + ctx->ShareLoD("Logits", /*->*/ "Softmax"); + ctx->ShareLoD("Logits", /*->*/ "Loss"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Logits")->type()), + ctx.device_context()); + } +}; + +class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")), + "Input(Loss@Grad) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Softmax"), + "Input(Softmax) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null."); + 
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Logits")), + "Output(Logits@Grad) should be not null."); + + auto softmax_dims = ctx->GetInputDim("Softmax"); + auto labels_dims = ctx->GetInputDim("Label"); + PADDLE_ENFORCE_EQ(labels_dims.size(), 2UL, + "The labels should be a 2-D tensor."); + + if (ctx->Attrs().Get("soft_label")) { + PADDLE_ENFORCE_EQ(softmax_dims[1], labels_dims[1], + "When Attr(soft_label) == true, the 2nd dimension of " + "Input(X) and Input(Label) should be equal."); + } else { + PADDLE_ENFORCE_EQ(labels_dims[1], 1UL, + "When Attr(soft_label) == false, the 2nd dimension of " + "Input(Label) should be 1."); + } + + ctx->SetOutputDim(framework::GradVarName("Logits"), + ctx->GetInputDim("Softmax")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType( + ctx.Input(framework::GradVarName("Loss"))->type()), + ctx.device_context()); + } +}; + +class SoftmaxGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* grad_op = new framework::OpDesc(); + grad_op->SetType("softmax_with_cross_entropy_grad"); + grad_op->SetInput("Label", Input("Label")); + grad_op->SetInput("Softmax", Output("Softmax")); + grad_op->SetInput("Loss", Output("Loss")); + grad_op->SetInput(framework::GradVarName("Softmax"), OutputGrad("Softmax")); + grad_op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss")); + grad_op->SetOutput(framework::GradVarName("Logits"), InputGrad("Logits")); + grad_op->SetAttrMap(Attrs()); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(softmax_with_cross_entropy, ops::SoftmaxWithCrossEntropyOp, + ops::SoftmaxWithCrossEntropyOpMaker, ops::SoftmaxGradMaker); +REGISTER_OPERATOR(softmax_with_cross_entropy_grad, + ops::SoftmaxWithCrossEntropyOpGrad); +REGISTER_OP_CPU_KERNEL(softmax_with_cross_entropy, + ops::SoftmaxWithCrossEntropyKernel, + ops::SoftmaxWithCrossEntropyKernel); +REGISTER_OP_CPU_KERNEL(softmax_with_cross_entropy_grad, + ops::SoftmaxWithCrossEntropyGradKernel, + ops::SoftmaxWithCrossEntropyGradKernel); diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..410d9e8887c593249495e08424467b4be15c9bcb --- /dev/null +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -0,0 +1,126 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
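The two loss equations above, together with the gradient rule used by the kernels that follow, can be sketched in NumPy: the forward pass computes a row-wise softmax and takes either -log of the labelled probability (hard labels, Label shaped [N, 1]) or the full cross entropy against a distribution (soft labels, Label shaped [N, K]); the backward pass reuses the saved Softmax output, and the logits gradient reduces to dLoss * (softmax - labels), with hard labels treated as one-hot rows. A minimal, illustrative sketch:

```python
import numpy as np

def softmax(x):
    e = np.exp(x - x.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

def softmax_ce_forward(logits, label, soft_label=False):
    prob = softmax(logits)                                   # "Softmax" output
    if soft_label:                                           # label: [N, K]
        loss = -(label * np.log(prob)).sum(axis=1, keepdims=True)
    else:                                                    # label: [N, 1]
        idx = label.reshape(-1).astype(int)
        loss = -np.log(prob[np.arange(len(idx)), idx]).reshape(-1, 1)
    return prob, loss                                        # loss: [N, 1]

def softmax_ce_backward(prob, label, dloss, soft_label=False):
    if soft_label:
        return dloss * (prob - label)
    grad = prob.copy()
    idx = label.reshape(-1).astype(int)
    grad[np.arange(len(idx)), idx] -= 1.0                    # softmax - one_hot
    return dloss * grad

logits = np.array([[2.0, 1.0, 0.1], [0.5, 2.5, 0.3]])
label = np.array([[0], [1]])
prob, loss = softmax_ce_forward(logits, label)
print(loss.ravel())                                          # per-sample loss
print(softmax_ce_backward(prob, label, np.ones((2, 1))))
```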
*/ + +#define EIGEN_USE_GPU + +#include "paddle/fluid/operators/softmax_with_cross_entropy_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +namespace { +template +__global__ void CrossEntropyGrad(T* logit_grad, const T* loss_grad, + const int64_t* labels, const int batch_size, + const int class_num) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int sample_idx = tid / class_num; + + if (tid < batch_size) { + PADDLE_ASSERT(labels[sample_idx] >= 0 && labels[sample_idx] < class_num); + logit_grad[tid * class_num + labels[tid]] -= static_cast(1.); + } + + __syncthreads(); + + if (tid < batch_size * class_num) { + logit_grad[tid] *= loss_grad[sample_idx]; + } +} + +template +__global__ void SoftCrossEntropyGradientKernel(T* logit_grad, + const T* loss_grad, + const T* labels, + const int batch_size, + const int class_num) { + int ids = blockIdx.x * blockDim.x + threadIdx.x; + if (ids < batch_size * class_num) { + int row_ids = ids / class_num; + logit_grad[ids] = loss_grad[row_ids] * (logit_grad[ids] - labels[ids]); + } +} +} // namespace + +template +class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()), + "This kernel only runs on GPU device."); + const Tensor* logits = context.Input("Logits"); + const Tensor* labels = context.Input("Label"); + Tensor* softmax = context.Output("Softmax"); + + Tensor* loss = context.Output("Loss"); + softmax->mutable_data(context.GetPlace()); + loss->mutable_data(context.GetPlace()); + + math::SoftmaxFunctor()( + context.cuda_device_context(), logits, softmax); + math::CrossEntropyFunctor()( + context.cuda_device_context(), loss, softmax, labels, + context.Attr("soft_label")); + } +}; + +template +class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()), + "This kernel only runs on GPU device."); + const Tensor* labels = context.Input("Label"); + const T* loss_grad_data = + context.Input(framework::GradVarName("Loss"))->data(); + Tensor* logit_grad = + context.Output(framework::GradVarName("Logits")); + logit_grad->ShareDataWith(*context.Input("Softmax")); + T* logit_grad_data = logit_grad->data(); + + const int batch_size = logit_grad->dims()[0]; + const int class_num = logit_grad->dims()[1]; + int block = 512; + int grid = (batch_size * class_num + block - 1) / block; + + if (context.Attr("soft_label")) { + const T* label_data = labels->data(); + SoftCrossEntropyGradientKernel< + T><<() + .stream()>>>(logit_grad_data, loss_grad_data, label_data, + batch_size, class_num); + } else { + const int64_t* label_data = labels->data(); + CrossEntropyGrad< + T><<() + .stream()>>>(logit_grad_data, loss_grad_data, label_data, + batch_size, class_num); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(softmax_with_cross_entropy, + ops::SoftmaxWithCrossEntropyCUDAKernel, + ops::SoftmaxWithCrossEntropyCUDAKernel); +REGISTER_OP_CUDA_KERNEL(softmax_with_cross_entropy_grad, + ops::SoftmaxWithCrossEntropyGradCUDAKernel, + ops::SoftmaxWithCrossEntropyGradCUDAKernel); diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.h b/paddle/fluid/operators/softmax_with_cross_entropy_op.h new file mode 100644 index 
0000000000000000000000000000000000000000..0927efd42ceb35cc4183f84d160c44f35f6cc3f5 --- /dev/null +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.h @@ -0,0 +1,90 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/cross_entropy.h" +#include "paddle/fluid/operators/math/softmax.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenMatrix = framework::EigenMatrix; + +template +class SoftmaxWithCrossEntropyKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + PADDLE_ENFORCE(platform::is_cpu_place(context.GetPlace()), + "This kernel only runs on CPU."); + const Tensor* logits = context.Input("Logits"); + const Tensor* labels = context.Input("Label"); + Tensor* softmax = context.Output("Softmax"); + Tensor* loss = context.Output("Loss"); + + softmax->mutable_data(context.GetPlace()); + loss->mutable_data(context.GetPlace()); + + auto& dev_ctx = + context.template device_context(); + math::SoftmaxFunctor()(dev_ctx, logits, + softmax); + math::CrossEntropyFunctor()( + dev_ctx, loss, softmax, labels, context.Attr("soft_label")); + } +}; + +template +class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* out_grad = + context.Input(framework::GradVarName("Loss")); + const Tensor* labels = context.Input("Label"); + Tensor* logit_grad = + context.Output(framework::GradVarName("Logits")); + logit_grad->ShareDataWith(*context.Input("Softmax")); + + const int class_num = logit_grad->dims()[1]; + auto out_grad_mat = EigenMatrix::From(*out_grad); + auto logit_grad_mat = EigenMatrix::From(*logit_grad); + auto& place = *context.template device_context() + .eigen_device(); + if (context.Attr("soft_label")) { + auto lbl_mat = EigenMatrix::From(*labels); + logit_grad_mat.device(place) = + out_grad_mat.broadcast(Eigen::DSizes(1, class_num)) * + (logit_grad_mat - lbl_mat); + } else { + logit_grad_mat.device(place) = + logit_grad_mat * + out_grad_mat.broadcast(Eigen::DSizes(1, class_num)); + + const int batch_size = logit_grad->dims()[0]; + const int64_t* label_data = labels->data(); + T* logit_grad_data = logit_grad->data(); + const T* out_grad_data = out_grad->data(); + for (int i = 0; i < batch_size; ++i) { + logit_grad_data[i * class_num + label_data[i]] -= out_grad_data[i]; + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/split_lod_tensor_op.cc b/paddle/fluid/operators/split_lod_tensor_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f821dc54d7bbe697d3642e64dc1628ec7d966592 --- /dev/null +++ b/paddle/fluid/operators/split_lod_tensor_op.cc @@ -0,0 +1,190 @@ +/* Copyright (c) 2016 PaddlePaddle 
Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +struct CopyRange { + size_t begin; + size_t end; +}; + +using LoD = framework::LoD; + +class SplitLoDTensorOp : public framework::OperatorBase { + public: + SplitLoDTensorOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &dev_place) const override { + auto &x = scope.FindVar(Input("X"))->Get(); + auto &mask = scope.FindVar(Input("Mask"))->Get(); + auto *out_true = + scope.FindVar(Output("OutTrue"))->GetMutable(); + auto *out_false = + scope.FindVar(Output("OutFalse"))->GetMutable(); + auto level = static_cast(Attr("level")); + auto &x_lod = x.lod(); + auto &mask_dim = mask.dims(); + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); + + std::unique_ptr cpu_mask{new framework::LoDTensor()}; + if (platform::is_cpu_place(mask.place())) { + cpu_mask->ShareDataWith(mask); + } else if (platform::is_gpu_place(mask.place())) { +#ifdef PADDLE_WITH_CUDA + framework::Copy(mask, platform::CPUPlace(), dev_ctx, cpu_mask.get()); +#else + PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option"); +#endif + } + auto *mask_data = cpu_mask->data(); + + std::vector> copy_ranges(mask_dim[0]); + + // set out_true/out_false lod + for (size_t t = 0; t < 2; t++) { + LoD *lod = nullptr; + if (t == 0) { + lod = out_false->mutable_lod(); + } else { + lod = out_true->mutable_lod(); + } + lod->clear(); + for (size_t i = 0; i < static_cast(mask_dim[0]); i++) { + if (static_cast(mask_data[i]) == t) { + size_t start_idx = i; + auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset( + x_lod, start_idx, start_idx + 1, level); + + auto &lod_length = lod_and_offset.first; + framework::AppendLoD(lod, lod_length); + + size_t start_offset = lod_and_offset.second.first; + size_t end_offset = lod_and_offset.second.second; + copy_ranges[t].emplace_back(CopyRange{start_offset, end_offset}); + } + } + } + + for (size_t t = 0; t < 2; ++t) { + framework::LoDTensor *out; + if (t == 0) { + out = out_false; + } else { + out = out_true; + } + auto &ranges = copy_ranges[t]; + size_t height = std::accumulate( + ranges.begin(), ranges.end(), 0UL, + [](size_t a, const CopyRange &b) { return a + b.end - b.begin; }); + auto x_dim = x.dims(); + x_dim[0] = static_cast(height); + out->Resize(x_dim); + out->mutable_data(x.place(), x.type()); + size_t offset = 0; + for (auto &each_range : ranges) { + size_t len = each_range.end - each_range.begin; + if (len == 0) { + continue; + } + // out[offset: offset+len] = x[each_range.begin: each_range.end] + auto slice = out->Slice(static_cast(offset), + 
static_cast(offset + len)); + framework::Copy(x.Slice(static_cast(each_range.begin), + static_cast(each_range.end)), + x.place(), dev_ctx, &slice); + offset += len; + } + } + } +}; + +class SplitLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + SplitLoDTensorOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input LoDTensor"); + AddInput("Mask", "A bool column vector which mask the input"); + AddOutput("OutTrue", "True branch of input LoDTensor"); + AddOutput("OutFalse", "False branch of input LoDTensor"); + AddAttr("level", "(int) the specific lod level to split.") + .SetDefault(0) + .EqualGreaterThan(0); + AddComment( + R"DOC( + Split a LoDTensor with a Mask at certain level. The input LoDTensor + has 3 sequence at certain lod level. The Mask is a bool column vector, + such as [0, 1, 0] at the same level. The first and third sequence will + be send to False Output LoDTensor; whereas the second sequence will + be send to True Output LoDTensor. Please refer to MergeLoDTensorOp.)DOC"); + } +}; + +class SplitLoDTensorInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInput("X"), + "SplitLoDTensorOp must has input X."); + PADDLE_ENFORCE(context->HasInput("Mask"), + "SplitLoDTensorOp must has input Mask."); + PADDLE_ENFORCE(context->HasOutput("OutTrue"), + "SplitLoDTensorOp must has output OutTrue."); + PADDLE_ENFORCE(context->HasOutput("OutFalse"), + "SplitLoDTensorOp must has output OutFalse."); + + auto mask_dim = context->GetInputDim("Mask"); + PADDLE_ENFORCE_EQ(mask_dim.size(), 2); + PADDLE_ENFORCE_EQ(mask_dim[1], 1); + + context->SetOutputDim("OutTrue", context->GetInputDim("X")); + context->SetOutputDim("OutFalse", context->GetInputDim("X")); + } +}; + +class SplitLoDTensorArrayGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("merge_lod_tensor"); + grad_op->SetInput("InTrue", OutputGrad("OutTrue")); + grad_op->SetInput("InFalse", OutputGrad("OutFalse")); + grad_op->SetInput("Mask", Input("Mask")); + grad_op->SetInput("X", Input("X")); + grad_op->SetOutput("Out", InputGrad("X")); + grad_op->SetAttrMap(Attrs()); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(split_lod_tensor, ops::SplitLoDTensorOp, + ops::SplitLoDTensorOpProtoMaker, + ops::SplitLoDTensorInferShape, + ops::SplitLoDTensorArrayGradMaker); diff --git a/paddle/fluid/operators/split_op.cc b/paddle/fluid/operators/split_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f8bc22fe1d3d24866b5bf2506ed0ff585d259cc2 --- /dev/null +++ b/paddle/fluid/operators/split_op.cc @@ -0,0 +1,135 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
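The mask-based routing described in the comment above, stripped of the LoD bookkeeping: sequences whose mask entry is 0 go to OutFalse and those with 1 go to OutTrue, each branch preserving the original order. A toy NumPy sketch, keeping the sequences as a Python list instead of a packed LoDTensor:

```python
import numpy as np

sequences = [np.array([[1.0], [2.0]]),            # seq 0, length 2
             np.array([[3.0]]),                   # seq 1, length 1
             np.array([[4.0], [5.0], [6.0]])]     # seq 2, length 3
mask = np.array([0, 1, 0], dtype=bool)            # per-sequence boolean column

out_true = [s for s, m in zip(sequences, mask) if m]        # -> [seq 1]
out_false = [s for s, m in zip(sequences, mask) if not m]   # -> [seq 0, seq 2]
print(len(out_true), len(out_false))                        # 1 2
```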
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/split_op.h" +#include "paddle/fluid/operators/net_op.h" + +namespace paddle { +namespace operators { +using framework::Tensor; + +class SplitOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SplitOp should not be null."); + PADDLE_ENFORCE_GE(ctx->Outputs("Out").size(), 1UL, + "Outputs(Out) of SplitOp should not be empty."); + auto in_dims = ctx->GetInputDim("X"); + auto outs_names = ctx->Outputs("Out"); + size_t axis = static_cast(ctx->Attrs().Get("axis")); + size_t num = static_cast(ctx->Attrs().Get("num")); + std::vector sections = static_cast>( + ctx->Attrs().Get>("sections")); + const size_t outs_number = outs_names.size(); + std::vector outs_dims; + outs_dims.reserve(outs_number); + + if (num > 0) { + int64_t in_axis_dim = in_dims[axis]; + PADDLE_ENFORCE_EQ(in_axis_dim % num, 0, + "tensor split does not result" + " in an equal division"); + size_t out_axis_dim = in_axis_dim / num; + for (size_t i = 0; i < outs_number; ++i) { + auto dim = in_dims; + dim[axis] = out_axis_dim; + outs_dims.push_back(dim); + } + } else if (sections.size() > 0) { + PADDLE_ENFORCE_EQ(sections.size(), outs_number, + "tensor split sections size" + "should be equal to output size."); + for (size_t i = 0; i < outs_number; ++i) { + auto dim = in_dims; + dim[axis] = sections[i]; + outs_dims.push_back(dim); + } + } + ctx->SetOutputsDim("Out", outs_dims); + if (axis != 0) { + // Only pass LoD when not spliting along the first dim. + for (size_t i = 0; i < outs_number; ++i) { + ctx->ShareLoD("X", "Out", 0, i); + } + } + } +}; + +class SplitOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SplitOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor) Input tensor of the split operator."); + AddOutput("Out", "(Tensor) Output tensors of the split operator.") + .AsDuplicable(); + AddComment(R"DOC( +Split operator + +This operator splits the input tensor into multiple sub-tensors. + +Example: + Input = [[1,2], + [3,4], + [5,6]] + sections = [2,1] + axis = 0 + Output[0] = [[1,2], + [3,4]] + Output[1] = [[5,6]] + + )DOC"); + AddAttr>("sections", + "(vector) " + "the length of each output along the " + "specified axis.") + .SetDefault(std::vector{}); + AddAttr("num", + "(int, default 0)" + "Number of sub-tensors. 
This must evenly divide " + "Input.dims()[axis]") + .SetDefault(0); + AddAttr<int>("axis", + "(int, default 0) " + "The axis which the input will be split on.") + .SetDefault(0); + } +}; + +class SplitGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr<framework::OpDesc> Apply() const override { + auto op = new framework::OpDesc(); + op->SetType("concat"); + op->SetInput("X", OutputGrad("Out")); + op->SetOutput("Out", InputGrad("X")); + op->SetAttrMap(Attrs()); + return std::unique_ptr<framework::OpDesc>(op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +USE_CPU_ONLY_OP(concat); + +REGISTER_OPERATOR(split, ops::SplitOp, ops::SplitOpMaker, ops::SplitGradMaker); +REGISTER_OP_CPU_KERNEL(split, + ops::SplitOpKernel<paddle::platform::CPUDeviceContext, float>); diff --git a/paddle/fluid/operators/split_op.cu.cc b/paddle/fluid/operators/split_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..279691c759e988ea26a62e1263198d2a9d878cf9 --- /dev/null +++ b/paddle/fluid/operators/split_op.cu.cc @@ -0,0 +1,18 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#pragma once + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/strided_memcpy.h" + +namespace paddle { +namespace operators { + +template +class SplitOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto outs = ctx.MultiOutput("Out"); + auto in_stride = framework::stride(in->dims()); + int64_t axis = static_cast(ctx.Attr("axis")); + const size_t n = outs.size(); + size_t input_offset = 0; + for (size_t i = 0; i < n; i++) { + auto& out = outs[i]; + out->mutable_data(ctx.GetPlace()); + size_t axis_dim = out->dims()[axis]; + auto out_stride = framework::stride(out->dims()); + StridedMemcpy(ctx.device_context(), in->data() + input_offset, + in_stride, out->dims(), out_stride, out->data()); + input_offset += axis_dim * in_stride[axis]; + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/split_selected_rows_op.cc b/paddle/fluid/operators/split_selected_rows_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..113ce2ce109778a355130aaf686261c1f71c0980 --- /dev/null +++ b/paddle/fluid/operators/split_selected_rows_op.cc @@ -0,0 +1,107 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/split_selected_rows_op.h" + +namespace paddle { +namespace operators { + +class SplitSelectedRowsOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SplitSelectedRowsOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input SelectedRows."); + AddOutput("Out", "The outputs of input SelectedRows.").AsDuplicable(); + AddAttr>("height_sections", + "Height for each output SelectedRows.") + .SetDefault(std::vector({})); + + AddComment(R"DOC( +Split a SelectedRows with a specified rows section. +height_sections is only needed when need to split the dims of the original tensor. 
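+
+Each input row id is routed to the output whose height section covers it: for
+instance, with height_sections = {4, 8}, ids in the range [0, 4) go to the
+first output and ids in [4, 12) go to the second, as the example below shows.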
+ +Example: + Input: + X.rows = {7, 5} + X.height = 12 + Attr: + height_sections = {4, 8} + Out: + out0.rows = {} + out0.height = 4 + + out1.rows = {5, 7} + out2.height = 8 + +)DOC"); + } +}; + +class SplitSelectedRowsOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "SplitSelectedRowsOp must has input X."); + PADDLE_ENFORCE(ctx->HasOutputs("Out"), + "SplitSelectedRowsOp must has output Out."); + + std::vector height_sections = + ctx->Attrs().Get>("height_sections"); + int64_t n = ctx->Outputs("Out").size(); + + std::vector outs_dims; + outs_dims.reserve(n); + + // make output dims + for (int64_t i = 0; i < n; ++i) { + auto dims = ctx->GetInputDim("X"); + if (height_sections.size()) { + PADDLE_ENFORCE_EQ( + height_sections.size(), static_cast(n), + "The size of height section should be the same with height" + " section size."); + dims[0] = height_sections[i]; + } + outs_dims.push_back(dims); + } + ctx->SetOutputsDim("Out", outs_dims); + } +}; + +class SplitSelectedRowsGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("sum"); + grad_op->SetInput("X", OutputGrad("Out")); + grad_op->SetOutput("Out", InputGrad("X")); + grad_op->SetAttrMap(Attrs()); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(split_selected_rows, ops::SplitSelectedRowsOp, + ops::SplitSelectedRowsOpMaker, + ops::SplitSelectedRowsGradMaker); +REGISTER_OP_CPU_KERNEL( + split_selected_rows, + ops::SplitSelectedRowsOpKernel); diff --git a/paddle/fluid/operators/split_selected_rows_op.cu b/paddle/fluid/operators/split_selected_rows_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..0bbf1ecfaefeddd426b1055d93ce39a138abec28 --- /dev/null +++ b/paddle/fluid/operators/split_selected_rows_op.cu @@ -0,0 +1,19 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/split_selected_rows_op.h" +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + split_selected_rows, + ops::SplitSelectedRowsOpKernel); diff --git a/paddle/fluid/operators/split_selected_rows_op.h b/paddle/fluid/operators/split_selected_rows_op.h new file mode 100644 index 0000000000000000000000000000000000000000..527264bd675520a98b442380e2d1ec259964e92e --- /dev/null +++ b/paddle/fluid/operators/split_selected_rows_op.h @@ -0,0 +1,88 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" + +namespace paddle { +namespace operators { + +static int FindOutIdx(int row, const std::vector& height_sections) { + int offset = 0; + for (size_t i = 0; i < height_sections.size(); ++i) { + if (row >= offset && row < (offset + height_sections[i])) { + return i; + } + offset += height_sections[i]; + } + return -1; +} + +template +class SplitSelectedRowsOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto outs = ctx.MultiOutput("Out"); + auto height_sections = ctx.Attr>("height_sections"); + + auto x_rows = x->rows(); + std::vector> outs_rows_idx; + outs_rows_idx.resize(outs.size()); + + auto row_numel = x->value().numel() / x->value().dims()[0]; + auto src = x->value().data(); + + for (size_t i = 0; i < x_rows.size(); ++i) { + int out_idx = FindOutIdx(x_rows[i], height_sections); + outs_rows_idx[out_idx].push_back(i); + } + auto place = ctx.GetPlace(); + + for (size_t i = 0; i < outs_rows_idx.size(); ++i) { + auto rows_idx = outs_rows_idx[i]; + if (rows_idx.size() > 0) { + auto dims = x->GetCompleteDims(); + dims[0] = rows_idx.size(); + outs[i]->mutable_value()->mutable_data(dims, x->place()); + for (auto idx : rows_idx) { + outs[i]->mutable_rows()->push_back(x_rows[idx]); + } + auto dst = outs[i]->mutable_value()->mutable_data(ctx.GetPlace()); + for (size_t j = 0; j < rows_idx.size(); j++) { + if (platform::is_cpu_place(place)) { + memory::Copy(platform::CPUPlace(), dst + j * row_numel, + platform::CPUPlace(), src + rows_idx[j] * row_numel, + sizeof(T) * row_numel); + } else { +#ifdef PADDLE_WITH_CUDA + auto stream = ctx.cuda_device_context().stream(); + memory::Copy(platform::CUDAPlace(), dst + j * row_numel, + platform::CUDAPlace(), src + rows_idx[j] * row_numel, + sizeof(T) * row_numel, stream); +#else + PADDLE_THROW("Paddle is not compiled with GPU"); +#endif + } + } + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/spp_op.cc b/paddle/fluid/operators/spp_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e6755b12000463b111fe65dbdf2c140a060d968b --- /dev/null +++ b/paddle/fluid/operators/spp_op.cc @@ -0,0 +1,99 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/spp_op.h" +namespace paddle { +namespace operators { + +class SppOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SppOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "X", + "(Tensor) The input tensor of spp operator. " + "The format of input tensor is NCHW. Where N is batch size, C is the " + "number of channels, H and W is the height and width of feature."); + AddOutput("Out", + "(Tensor) The output tensor of spp operator." + "N * M." + "M = C * H * W"); + AddAttr("pyramid_height", "(int), multi level pooling"); + AddAttr( + "pooling_type", + "(string), pooling type, can be \"max\" for max-pooling " + "and \"avg\" for average-pooling.") + .InEnum({"max", "avg"}); + AddComment(R"DOC( + "With spatial pyramid pooling, the input image can + be of any sizes. This not only allows arbitrary aspect + ratios, but also allows arbitrary scales. We can resize + the input image to any scale (e.g., min(w, h)=180, 224, + ...) and apply the same deep network. When the + input image is at different scales, the network (with + the same filter sizes) will extract features at different + scales. The scales play important roles in traditional + methods. + Input shape: $(N, C_{in}, H_{in}, W_{in})$ + Output shape: $(H_{out}, W_{out})$ + Where + $$ + H_{out} = N \\ + W_{out} = (((4^pyramid_height) - 1) / (4 - 1))$ * C_{in} + $$ + paper https://arxiv.org/pdf/1406.4729v4.pdf + )DOC"); + } +}; + +class SppOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SppOp" + "should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SppOp should not be null."); + auto in_x_dims = ctx->GetInputDim("X"); + int pyramid_height = ctx->Attrs().Get("pyramid_height"); + PADDLE_ENFORCE(in_x_dims.size() == 4, + "Spping intput must be of 4-dimensional."); + int outlen = ((std::pow(4, pyramid_height) - 1) / (4 - 1)) * in_x_dims[1]; + std::vector output_shape({in_x_dims[0], outlen}); + ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + } +}; + +class SppOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Input(X@GRAD) should not be null."); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(spp, ops::SppOp, ops::SppOpMaker, spp_grad, ops::SppOpGrad); +REGISTER_OP_CPU_KERNEL( + spp, ops::SppKernel, + ops::SppKernel); +REGISTER_OP_CPU_KERNEL( + spp_grad, ops::SppGradKernel, + ops::SppGradKernel); diff --git a/paddle/fluid/operators/spp_op.cu.cc b/paddle/fluid/operators/spp_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..cad2ca5ef8e16ee6ea6943f61da33367099e0937 --- /dev/null +++ b/paddle/fluid/operators/spp_op.cu.cc @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/spp_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + spp, ops::SppKernel, + ops::SppKernel); +REGISTER_OP_CUDA_KERNEL( + spp_grad, ops::SppGradKernel, + ops::SppGradKernel); diff --git a/paddle/fluid/operators/spp_op.h b/paddle/fluid/operators/spp_op.h new file mode 100644 index 0000000000000000000000000000000000000000..1da1f805807fc648b3a54de91842f163b356435b --- /dev/null +++ b/paddle/fluid/operators/spp_op.h @@ -0,0 +1,161 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/pooling.h" +#include "paddle/fluid/operators/strided_memcpy.h" + +namespace paddle { +namespace operators { +template +class SppKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor* in_x = context.Input("X"); + auto* out = context.Output("Out"); + int pyramid_height = context.template Attr("pyramid_height"); + std::string pooling_type = + context.template Attr("pooling_type"); + out->mutable_data(context.GetPlace()); + auto out_stride = framework::stride(out->dims()); + int input_h = in_x->dims()[2]; + int input_w = in_x->dims()[3]; + size_t output_offset = 0; + for (int p = 0; p < pyramid_height; ++p) { + int bins = std::pow(2, p); + int kernel_size_h = std::ceil(input_h / static_cast(bins)); + int kernel_size_w = std::ceil(input_w / static_cast(bins)); + int padding_h = (kernel_size_h * bins - input_h + 1) / 2; + int padding_w = (kernel_size_w * bins - input_w + 1) / 2; + std::vector kernel_size({kernel_size_h, kernel_size_w}); + std::vector strides({kernel_size_h, kernel_size_w}); + std::vector paddings({padding_h, padding_w}); + // pooling output shape + framework::Tensor out_level; + std::vector output_shape_vec( + {in_x->dims()[0], in_x->dims()[1], bins, bins}); + framework::DDim output_shape(framework::make_ddim(output_shape_vec)); + out_level.mutable_data(output_shape, context.GetPlace()); + // pooling + if (pooling_type == "max") { + math::Pool2dFunctor, T> pool_forward; + math::MaxPool max_process; + pool_forward(context.template device_context(), *in_x, + kernel_size, strides, paddings, max_process, &out_level); + } else if (pooling_type == "avg") { + math::Pool2dFunctor, T> pool_forward; + math::AvgPool avg_process; + pool_forward(context.template device_context(), *in_x, + kernel_size, strides, paddings, avg_process, &out_level); + } + // 
flatten pooling output shape + int output_flatten_w = in_x->dims()[1] * bins * bins; + std::vector output_flatten_shape_vec( + {in_x->dims()[0], output_flatten_w}); + framework::DDim output_flatten_shape( + framework::make_ddim(output_flatten_shape_vec)); + out_level.Resize(output_flatten_shape); + // concat + auto out_level_stride = framework::stride(out_level.dims()); + StridedMemcpy(context.template device_context(), + out_level.data(), out_level_stride, out_level.dims(), + out_stride, out->data() + output_offset); + output_offset += out_level.dims()[1] * out_level_stride[1]; + } + } +}; +template +class SppGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor* in_x = context.Input("X"); + const framework::Tensor* out = context.Input("Out"); + const framework::Tensor* out_grad = + context.Input(framework::GradVarName("Out")); + framework::Tensor* in_x_grad = + context.Output(framework::GradVarName("X")); + int pyramid_height = context.template Attr("pyramid_height"); + std::string pooling_type = + context.template Attr("pooling_type"); + auto& device_ctx = context.template device_context(); + math::SetConstant zero; + in_x_grad->mutable_data(context.GetPlace()); + zero(device_ctx, in_x_grad, static_cast(0)); + auto out_stride = framework::stride(out->dims()); + int input_h = in_x->dims()[2]; + int input_w = in_x->dims()[3]; + size_t out_offset = 0; + for (int p = 0; p < pyramid_height; ++p) { + int bins = std::pow(2, p); + int kernel_size_h = std::ceil(input_h / static_cast(bins)); + int kernel_size_w = std::ceil(input_w / static_cast(bins)); + int padding_h = (kernel_size_h * bins - input_h + 1) / 2; + int padding_w = (kernel_size_w * bins - input_w + 1) / 2; + std::vector kernel_size({kernel_size_h, kernel_size_w}); + std::vector strides({kernel_size_h, kernel_size_w}); + std::vector paddings({padding_h, padding_w}); + // split out and outgrad ... 
to flatten + framework::Tensor out_level; + framework::Tensor outgrad_level; + int out_flatten_w = in_x->dims()[1] * bins * bins; + std::vector out_flatten_shape_vec( + {in_x->dims()[0], out_flatten_w}); + framework::DDim out_flatten_shape( + framework::make_ddim(out_flatten_shape_vec)); + out_level.mutable_data(out_flatten_shape, context.GetPlace()); + outgrad_level.mutable_data(out_flatten_shape, context.GetPlace()); + auto flatten_stride = framework::stride(out_level.dims()); + // memcpy + StridedMemcpy(context.template device_context(), + out->data() + out_offset, out_stride, + out_level.dims(), flatten_stride, out_level.data()); + + StridedMemcpy(context.template device_context(), + out_grad->data() + out_offset, out_stride, + outgrad_level.dims(), flatten_stride, + outgrad_level.data()); + out_offset += out_level.dims()[1] * out_stride[1]; + // flatten backward to nchw + + std::vector out_shape_vec({in_x->dims()[0], in_x->dims()[1]}); + out_shape_vec.push_back( + (input_h - kernel_size_h + 2 * padding_h) / kernel_size_h + 1); + out_shape_vec.push_back( + (input_w - kernel_size_w + 2 * padding_w) / kernel_size_w + 1); + framework::DDim out_shape(framework::make_ddim(out_shape_vec)); + out_level.ShareDataWith(out_level); + out_level.Resize(out_shape); + outgrad_level.ShareDataWith(outgrad_level); + outgrad_level.Resize(out_shape); + // pooling backward + if (pooling_type == "max") { + math::MaxPool2dGradFunctor pool2d_backward; + pool2d_backward(context.template device_context(), *in_x, + *&out_level, *&outgrad_level, kernel_size, strides, + paddings, in_x_grad); + } else if (pooling_type == "avg") { + math::Pool2dGradFunctor, T> + pool_backward; + math::AvgPoolGrad avg_process; + pool_backward(context.template device_context(), *in_x, + *&out_level, *&outgrad_level, kernel_size, strides, + paddings, avg_process, in_x_grad); + } + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/squared_l2_distance_op.cc b/paddle/fluid/operators/squared_l2_distance_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c1d0c2c7f392cae3dc30611a0f077c1af7b68cbe --- /dev/null +++ b/paddle/fluid/operators/squared_l2_distance_op.cc @@ -0,0 +1,120 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/squared_l2_distance_op.h" + +namespace paddle { +namespace operators { + +class SquaredL2DistanceOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SquaredL2DistanceOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), + "Input(Y) of SquaredL2DistanceOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("sub_result"), + "Output(sub_result) of SquaredL2DistanceOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SquaredL2DistanceOp should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + + PADDLE_ENFORCE_EQ(framework::arity(x_dims), framework::arity(y_dims), + "Tensor rank of both SquaredL2DistanceOp's " + "inputs must be same."); + + int rank = framework::arity(x_dims); + PADDLE_ENFORCE_GE(rank, 2, "Tensor rank should be at least equal to 2."); + PADDLE_ENFORCE_EQ(product(x_dims) / x_dims[0], product(y_dims) / y_dims[0], + "Product of dimensions expcet the first dimension of " + "input and target must be equal."); + PADDLE_ENFORCE(y_dims[0] == 1 || y_dims[0] == x_dims[0], + "First dimension of target must be equal to input " + "or to 1."); + + ctx->SetOutputDim("sub_result", {x_dims[0], product(x_dims) / x_dims[0]}); + ctx->SetOutputDim("Out", {x_dims[0], 1}); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class SquaredL2DistanceOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SquaredL2DistanceOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor) Input of SquaredL2DistanceOp."); + AddInput("Y", "(Tensor) Target of SquaredL2DistanceOp."); + AddOutput("sub_result", + "(Tensor) Buffering subtraction result which " + "will be reused in backward.") + .AsIntermediate(); + AddOutput("Out", "(Tensor) Squared l2 distance between input and target."); + AddComment(R"DOC( +SquaredL2Distance operator + +This operator will cacluate the squared L2 distance for the input and +the target. Number of distance value will be equal to the first dimension +of input. First dimension of the target could be equal to the input or to 1. +If the first dimension of target is 1, the operator will broadcast target's +first dimension to input's first dimension. During backward propagation, +the user can decide whether to calculate the gradient of the input or +the target or both. + +Both the input X and Y can carry the LoD (Level of Details) information. +However, the output only shares the LoD information with input X. 
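+
+For instance, with a 2 x 2 input X and a single-row target Y that is broadcast
+along the first dimension:
+
+    X = [[1, 2],
+         [3, 4]]
+    Y = [[1, 1]]
+
+    sub_result = [[0, 1],
+                  [2, 3]]
+    Out = [[1],
+           [13]]
+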
+ )DOC"); + } +}; + +class SquaredL2DistanceGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Gradient of Out should not be null"); + auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + PADDLE_ENFORCE_EQ(out_dims[0], x_dims[0], + "First dimension of output gradient and " + "input value must be equal."); + PADDLE_ENFORCE_EQ(out_dims[1], 1, + "Second dimension of output gradient " + "must be 1."); + auto x_grad_name = framework::GradVarName("X"); + auto y_grad_name = framework::GradVarName("Y"); + if (ctx->HasOutput(x_grad_name)) ctx->SetOutputDim(x_grad_name, x_dims); + if (ctx->HasOutput(y_grad_name)) ctx->SetOutputDim(y_grad_name, y_dims); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(squared_l2_distance, ops::SquaredL2DistanceOp, + ops::SquaredL2DistanceOpMaker, squared_l2_distance_grad, + ops::SquaredL2DistanceGradOp); +REGISTER_OP_CPU_KERNEL( + squared_l2_distance, + ops::SquaredL2DistanceKernel); +REGISTER_OP_CPU_KERNEL(squared_l2_distance_grad, + ops::SquaredL2DistanceGradKernel< + paddle::platform::CPUDeviceContext, float>); diff --git a/paddle/fluid/operators/squared_l2_distance_op.cu b/paddle/fluid/operators/squared_l2_distance_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..959e7afac99bd2565890cb6a296bc908f250a16c --- /dev/null +++ b/paddle/fluid/operators/squared_l2_distance_op.cu @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU + +#include "paddle/fluid/operators/squared_l2_distance_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + squared_l2_distance, + ops::SquaredL2DistanceKernel); +REGISTER_OP_CUDA_KERNEL(squared_l2_distance_grad, + ops::SquaredL2DistanceGradKernel< + paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/fluid/operators/squared_l2_distance_op.h b/paddle/fluid/operators/squared_l2_distance_op.h new file mode 100644 index 0000000000000000000000000000000000000000..aab241247e5e92f43a997f3d29c8e7d7d44d7711 --- /dev/null +++ b/paddle/fluid/operators/squared_l2_distance_op.h @@ -0,0 +1,125 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; +template +using EigenMatrix = framework::EigenMatrix; + +template +class SquaredL2DistanceKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in0 = context.Input("X"); + auto* in1 = context.Input("Y"); + auto* out0 = context.Output("sub_result"); + auto* out1 = context.Output("Out"); + + auto in0_dims = in0->dims(); + auto in1_dims = in1->dims(); + + int cols = in0->numel() / in0_dims[0]; + // reduce dimensions except the first + auto x = + EigenMatrix::From(*in0, framework::make_ddim({in0_dims[0], cols})); + auto y = + EigenMatrix::From(*in1, framework::make_ddim({in1_dims[0], cols})); + + out0->mutable_data(context.GetPlace()); + out1->mutable_data(context.GetPlace()); + auto sub_result = EigenMatrix::From(*out0); + auto z = EigenVector::Flatten(*out1); + + auto& place = + *context.template device_context().eigen_device(); + auto x_dims = x.dimensions(); + auto y_dims = y.dimensions(); + // buffer the substraction result + if (y_dims[0] == 1 && x_dims[0] > y_dims[0]) { + sub_result.device(place) = + x - + y.broadcast(Eigen::array({{static_cast(x_dims[0]), 1}})); + } else { + sub_result.device(place) = x - y; + } + auto sub_res_pow2 = sub_result * sub_result; + z.device(place) = sub_res_pow2.sum(Eigen::array({{1}})); + } +}; + +template +class SquaredL2DistanceGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in0 = context.Input("sub_result"); + auto* in1 = context.Input(framework::GradVarName("Out")); + auto* x_g = context.Output(framework::GradVarName("X")); + auto* y_g = context.Output(framework::GradVarName("Y")); + + auto sub_result = EigenMatrix::From(*in0); + auto out_grad = EigenMatrix::From(*in1); + + auto x_dims = x_g->dims(); + auto y_dims = y_g->dims(); + + int cols = x_g->numel() / x_dims[0]; + // calculate gradient + auto grad_mat = 2 * + (out_grad.broadcast(Eigen::array({{1, cols}}))) * + sub_result; + + // propagate back to input + auto& eigen_place = + *context.template device_context().eigen_device(); + if (x_g) { + x_g->mutable_data(context.GetPlace()); + // eigen matrix + auto x_grad = + EigenMatrix::From(*x_g, framework::make_ddim({x_dims[0], cols})); + // dimensions are same with subResult + x_grad.device(eigen_place) = grad_mat; + } + + if (y_g) { + y_g->mutable_data(context.GetPlace()); + + PADDLE_ENFORCE_GE(sub_result.dimensions()[0], y_dims[0], + "First dimension of gradient must be greater or " + "equal than first dimension of target."); + + if (sub_result.dimensions()[0] == y_dims[0]) { + auto y_grad = + EigenMatrix::From(*y_g, framework::make_ddim({y_dims[0], cols})); + y_grad.device(eigen_place) = -1 * grad_mat; + } else { + auto col_sum_res = -1 * (grad_mat.sum(Eigen::array({{0}}))); + auto y_grad = EigenVector::Flatten(*y_g); + y_grad.device(eigen_place) = col_sum_res; + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/squared_l2_norm_op.cc b/paddle/fluid/operators/squared_l2_norm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a43cc22994b1b15a4acf0fd89b956bf05d3f35c8 --- 
/dev/null +++ b/paddle/fluid/operators/squared_l2_norm_op.cc @@ -0,0 +1,77 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/squared_l2_norm_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class SquaredL2NormOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should be not null."); + + ctx->SetOutputDim("Out", {1}); + } +}; + +class SquaredL2NormGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Output(X@GRAD) should be not null."); + + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } +}; + +class SquaredL2NormOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SquaredL2NormOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor) The input of squared_l2_norm op."); + AddOutput("Out", "(Scalar) The output of squared_l2_norm op."); + AddComment(R"DOC( +SquaredL2Norm Operator. + +Computes the squared L2 norm of a tensor. + +$$Out = \sum_{i} X_{i}^2$$ + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(squared_l2_norm, ops::SquaredL2NormOp, ops::SquaredL2NormOpMaker, + squared_l2_norm_grad, ops::SquaredL2NormGradOp); +REGISTER_OP_CPU_KERNEL( + squared_l2_norm, + ops::SquaredL2NormKernel); +REGISTER_OP_CPU_KERNEL( + squared_l2_norm_grad, + ops::SquaredL2NormGradKernel); diff --git a/paddle/fluid/operators/squared_l2_norm_op.cu b/paddle/fluid/operators/squared_l2_norm_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..52f4ab79b2189b269b7d0685dccedc52b627ad8d --- /dev/null +++ b/paddle/fluid/operators/squared_l2_norm_op.cu @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/squared_l2_norm_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + squared_l2_norm, + ops::SquaredL2NormKernel); +REGISTER_OP_CUDA_KERNEL( + squared_l2_norm_grad, + ops::SquaredL2NormGradKernel); diff --git a/paddle/fluid/operators/squared_l2_norm_op.h b/paddle/fluid/operators/squared_l2_norm_op.h new file mode 100644 index 0000000000000000000000000000000000000000..56524636b8f1b063266fda0997e91a703131adff --- /dev/null +++ b/paddle/fluid/operators/squared_l2_norm_op.h @@ -0,0 +1,66 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +// Out = sum(square(X)) +template +class SquaredL2NormKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + const framework::Tensor *X = context.Input("X"); + framework::Tensor *Out = context.Output("Out"); + Out->mutable_data(context.GetPlace()); + + auto x = framework::EigenVector::Flatten(*X); + auto out = framework::EigenScalar::From(*Out); + auto *place = + context.template device_context().eigen_device(); + + out.device(*place) = x.square().sum(); + } +}; + +// dX = X +template +class SquaredL2NormGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + const framework::Tensor *X = context.Input("X"); + const framework::Tensor *dOut = + context.Input(framework::GradVarName("Out")); + PADDLE_ENFORCE(dOut->numel() == 1, + "Squared L2 Norm Gradient should be scalar"); + framework::Tensor *dX = + context.Output(framework::GradVarName("X")); + dX->mutable_data(context.GetPlace()); + + auto x = framework::EigenVector::Flatten(*X); + auto dout = framework::EigenVector::Flatten(*dOut); + auto dx = framework::EigenVector::Flatten(*dX); + auto *place = + context.template device_context().eigen_device(); + + Eigen::DSizes x_dsize(X->numel()); + dx.device(*place) = (dout.broadcast(x_dsize) * x) * static_cast(2.0); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h new file mode 100644 index 0000000000000000000000000000000000000000..8a99b405e266da48427fa23e9a3e67f2bc54c5a0 --- /dev/null +++ b/paddle/fluid/operators/strided_memcpy.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/operators/detail/strided_memcpy.h" + +namespace paddle { +namespace operators { + +// Strided memory copy from src to dst. +// +// The src and dst should be both on dev_ctx.GetPlace(), otherwise, there will +// be a segment fault. +// +// The stride of an array (also referred to as increment, pitch or step size) is +// the number of locations in memory between beginnings of successive array +// elements +// +// For example, for tensor like [1, 3, 300, 300]. If there is no padding, the +// stride is [270000, 90000, 300, 1]. +// +// NOTE: When use GPU, the memcpy is async. To sync memcpy, please invoke +// `dev_ctx.Wait()`. +template +inline void StridedMemcpy(const platform::DeviceContext& dev_ctx, const T* src, + const framework::DDim& src_stride, + const framework::DDim& dst_dim, + const framework::DDim& dst_stride, T* dst) { + using namespace detail; + StridedCopyDimVisitor func(dev_ctx, src, src_stride, dst_stride, dst); + boost::apply_visitor(func, dst_dim); +} +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/strided_memcpy_test.cc b/paddle/fluid/operators/strided_memcpy_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..a369941a993c02744cb7de0d6c6c878f56d5e0fe --- /dev/null +++ b/paddle/fluid/operators/strided_memcpy_test.cc @@ -0,0 +1,161 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/strided_memcpy.h" +#include "gtest/gtest.h" +#include "paddle/fluid/memory/memory.h" + +namespace paddle { +namespace operators { + +TEST(StridedMemcpy, CPUCrop) { + // clang-format off + int src[] = { + 0, 1, 2, 0, 0, + 0, 3, 4, 0, 0, + 0, 0, 0, 0, 0, + }; + // clang-format on + + framework::DDim src_stride({5, 1}); + + int dst[4]; + framework::DDim dst_dim({2, 2}); + framework::DDim dst_stride({2, 1}); + + platform::CPUDeviceContext ctx; + StridedMemcpy(ctx, src + 1, src_stride, dst_dim, dst_stride, dst); + + ASSERT_EQ(1, dst[0]); + ASSERT_EQ(2, dst[1]); + ASSERT_EQ(3, dst[2]); + ASSERT_EQ(4, dst[3]); +} + +TEST(StridedMemcpy, CPUConcat) { + // clang-format off + int src[] = { + 1, 2, + 3, 4 + }; + // clang-format on + + int dst[8]; + + framework::DDim src_stride({2, 1}); + framework::DDim dst_dim({2, 2}); + framework::DDim dst_stride({4, 1}); + platform::CPUDeviceContext ctx; + + StridedMemcpy(ctx, src, src_stride, dst_dim, dst_stride, dst); + StridedMemcpy(ctx, src, src_stride, dst_dim, dst_stride, dst + 2); + + // clang-format off + int expect_dst[] = { + 1, 2, 1, 2, + 3, 4, 3, 4 + }; + // clang-format on + for (size_t i = 0; i < sizeof(expect_dst) / sizeof(int); ++i) { + ASSERT_EQ(expect_dst[i], dst[i]); + } +} + +#ifdef PADDLE_WITH_CUDA +TEST(StridedMemcpy, GPUCrop) { + // clang-format off + int src[] = { + 0, 1, 2, 0, 0, + 0, 3, 4, 0, 0, + 0, 0, 0, 0, 0, + }; + // clang-format on + + platform::CUDAPlace gpu0(0); + platform::CPUPlace cpu; + + platform::CUDADeviceContext ctx(gpu0); + + int* gpu_src = reinterpret_cast(memory::Alloc(gpu0, sizeof(src))); + memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream()); + + framework::DDim src_stride({5, 1}); + + int dst[4]; + int* gpu_dst = reinterpret_cast(memory::Alloc(gpu0, sizeof(dst))); + + framework::DDim dst_dim({2, 2}); + framework::DDim dst_stride({2, 1}); + + StridedMemcpy(ctx, gpu_src + 1, src_stride, dst_dim, dst_stride, + gpu_dst); + + memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx.stream()); + ctx.Wait(); + + ASSERT_EQ(1, dst[0]); + ASSERT_EQ(2, dst[1]); + ASSERT_EQ(3, dst[2]); + ASSERT_EQ(4, dst[3]); + + memory::Free(gpu0, gpu_dst); + memory::Free(gpu0, gpu_src); +} + +TEST(StridedMemcpy, GPUConcat) { + // clang-format off + int src[] = { + 1, 2, + 3, 4 + }; + // clang-format on + + platform::CUDAPlace gpu0(0); + platform::CPUPlace cpu; + platform::CUDADeviceContext ctx(gpu0); + + int* gpu_src = reinterpret_cast(memory::Alloc(gpu0, sizeof(src))); + memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream()); + + int dst[8]; + int* gpu_dst = reinterpret_cast(memory::Alloc(gpu0, sizeof(dst))); + + framework::DDim src_stride({2, 1}); + framework::DDim dst_dim({2, 2}); + framework::DDim dst_stride({4, 1}); + + StridedMemcpy(ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst); + StridedMemcpy(ctx, gpu_src, src_stride, dst_dim, dst_stride, + gpu_dst + 2); + + memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx.stream()); + ctx.Wait(); + + // clang-format off + int expect_dst[] = { + 1, 2, 1, 2, + 3, 4, 3, 4 + }; + // clang-format on + for (size_t i = 0; i < sizeof(expect_dst) / sizeof(int); ++i) { + ASSERT_EQ(expect_dst[i], dst[i]); + } + + memory::Free(gpu0, gpu_dst); + memory::Free(gpu0, gpu_src); +} + +#endif +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..96f851720aea2ddd643e1e6251fee314e26cbf95 --- /dev/null +++ 
b/paddle/fluid/operators/sum_op.cc @@ -0,0 +1,200 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sum_op.h" +#include +#include "paddle/fluid/framework/var_type_inference.h" +#include "paddle/fluid/operators/detail/safe_ref.h" + +namespace paddle { +namespace operators { +using framework::Tensor; + +class SumOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInputs("X"), "Inputs(X) should not be null"); + + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SumOp should not be null."); + if (ctx->IsRuntime() && + ctx->GetOutputsVarType("Out")[0] == + framework::proto::VarDesc::LOD_TENSOR_ARRAY) { + return; // skip runtime infershape when is tensor array; + } + + auto x_dims = ctx->GetInputsDim("X"); + size_t N = x_dims.size(); + PADDLE_ENFORCE_GT(N, 1, "Input tensors count should > 1."); + + framework::DDim in_dim({0}); + for (auto& x_dim : x_dims) { + if (framework::product(x_dim) == 0) { + continue; + } + if (framework::product(in_dim) == 0) { + in_dim = x_dim; + } else { + PADDLE_ENFORCE_EQ(in_dim, x_dim, "Input tensors must have same shape"); + } + } + ctx->SetOutputDim("Out", in_dim); + ctx->ShareLoD("X", /*->*/ "Out"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto x_vars = ctx.MultiInputVar("X"); + if (x_vars[0]->IsType()) { + int dtype = -1; + for (auto& x_var : x_vars) { + auto& lod_tensor = x_var->Get(); + if (lod_tensor.numel() == 0) { + continue; + } + if (dtype == -1) { + dtype = framework::ToDataType(lod_tensor.type()); + } else { + PADDLE_ENFORCE_EQ(dtype, framework::ToDataType(lod_tensor.type())); + } + } + PADDLE_ENFORCE_NE(dtype, -1, + "Sum operator should have at least one tensor"); + + return framework::OpKernelType( + static_cast(dtype), ctx.device_context()); + } else if (x_vars[0]->IsType()) { + return framework::OpKernelType( + framework::ToDataType( + x_vars[0]->Get().value().type()), + ctx.device_context()); + } else if (x_vars[0]->IsType()) { + for (auto& x_var : x_vars) { + auto& array = x_var->Get(); + for (auto& each : array) { + if (each.numel() != 0) { + return framework::OpKernelType(framework::ToDataType(each.type()), + ctx.device_context()); + } + } + } + PADDLE_THROW("Cannot find the input data type by all input data"); + } + PADDLE_THROW("Unexpected branch. Input type is %s", + x_vars[0]->Type().name()); + } +}; + +class SumOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SumOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(vector) The input tensors of sum operator.") + .AsDuplicable(); + AddOutput("Out", "(Tensor) The output tensor of sum operator."); + AddComment(R"DOC( +Sum operator. + +This operators sums the input tensors. 
All the inputs can carry the +LoD (Level of Details) information. However, the output only shares +the LoD information with the first input. +)DOC"); + } +}; + +class SumOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + auto& inputs = op_desc.Input("X"); + auto var_type = framework::proto::VarDesc::SELECTED_ROWS; + + for (auto& name : op_desc.Input("X")) { + VLOG(10) << name << " " + << block->FindRecursiveOrCreateVar(name).GetType(); + } + + bool any_input_is_lod_tensor = std::any_of( + inputs.begin(), inputs.end(), [block](const std::string& name) { + return block->FindRecursiveOrCreateVar(name).GetType() == + framework::proto::VarDesc::LOD_TENSOR; + }); + + auto is_tensor_array = [block](const std::string& name) { + return block->FindRecursiveOrCreateVar(name).GetType() == + framework::proto::VarDesc::LOD_TENSOR_ARRAY; + }; + + bool any_input_is_tensor_array = + std::any_of(inputs.begin(), inputs.end(), is_tensor_array); + bool all_inputs_are_tensor_array = + std::all_of(inputs.begin(), inputs.end(), is_tensor_array); + + if (any_input_is_tensor_array) { + if (!all_inputs_are_tensor_array) { + std::ostringstream os; + for (auto& each : inputs) { + os << " " << each << " type is " + << block->FindRecursiveOrCreateVar(each).GetType() << "\n"; + } + PADDLE_ENFORCE(all_inputs_are_tensor_array, + "Not all inputs are tensor array:\n%s", os.str()); + } + var_type = framework::proto::VarDesc::LOD_TENSOR_ARRAY; + } else if (any_input_is_lod_tensor) { + var_type = framework::proto::VarDesc::LOD_TENSOR; + } + + auto out_var_name = op_desc.Output("Out").front(); + auto& out_var = block->FindRecursiveOrCreateVar(out_var_name); + out_var.SetType(var_type); + auto& in_var = detail::Ref(block->FindVarRecursive(inputs.front())); + out_var.SetDataType(in_var.GetDataType()); + } +}; + +class SumGradMaker : public framework::GradOpDescMakerBase { + public: + using framework::GradOpDescMakerBase::GradOpDescMakerBase; + + std::vector> operator()() const override { + auto x_grads = InputGrad("X", false); + std::vector> grad_ops; + grad_ops.reserve(x_grads.size()); + auto og = OutputGrad("Out"); + std::transform(x_grads.begin(), x_grads.end(), std::back_inserter(grad_ops), + [&og](const std::string& x_grad) { + auto* grad_op = new framework::OpDesc(); + grad_op->SetType("scale"); + grad_op->SetInput("X", og); + grad_op->SetOutput("Out", {x_grad}); + grad_op->SetAttr("scale", 1.0f); + return std::unique_ptr(grad_op); + }); + return grad_ops; + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(sum, ops::SumOp, ops::SumOpMaker, ops::SumGradMaker, + ops::SumOpVarTypeInference); +REGISTER_OP_CPU_KERNEL( + sum, ops::SumKernel, + ops::SumKernel, + ops::SumKernel, + ops::SumKernel); diff --git a/paddle/fluid/operators/sum_op.cu b/paddle/fluid/operators/sum_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..8d8f90d7510bae854a0507adaf8998fb7aea3b58 --- /dev/null +++ b/paddle/fluid/operators/sum_op.cu @@ -0,0 +1,20 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/sum_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + sum, ops::SumKernel, + ops::SumKernel, + ops::SumKernel, + ops::SumKernel); diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h new file mode 100644 index 0000000000000000000000000000000000000000..5e1222c6ef723a6321392a5af7fdb558c24df32b --- /dev/null +++ b/paddle/fluid/operators/sum_op.h @@ -0,0 +1,158 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using SelectedRows = framework::SelectedRows; +using LoDTensor = framework::LoDTensor; +template +using EigenVector = framework::EigenVector; + +template +class SumKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto in_vars = context.MultiInputVar("X"); + int N = in_vars.size(); + auto out_var = context.OutputVar("Out"); + + bool in_place = out_var == in_vars[0]; + + if (out_var->IsType()) { + auto *out = context.Output("Out"); + if (!in_place) { + out->mutable_data(context.GetPlace()); + } + auto result = EigenVector::Flatten(*out); + if (!in_place) { + math::SetConstant constant_functor; + constant_functor(context.template device_context(), out, + 0.0); + } + + math::SelectedRowsAddToTensor functor; + auto &place = + *context.template device_context().eigen_device(); + // If in_place, just skip the first tensor + for (int i = in_place ? 
1 : 0; i < N; i++) { + if (in_vars[i]->IsType()) { + auto &in_t = in_vars[i]->Get(); + if (in_t.numel() == 0) { + continue; + } + auto in = EigenVector::Flatten(in_t); + result.device(place) = result + in; + } else if (in_vars[i]->IsType()) { + auto &in_t = in_vars[i]->Get(); + functor(context.template device_context(), in_t, out); + } else { + PADDLE_THROW("Variable type must be LoDTensor/SelectedRows."); + } + } + } else if (out_var->IsType()) { + std::unique_ptr in0; + if (in_place) { + // If is in_place, we store the input[0] to in0 + auto &in_sel0 = in_vars[0]->Get(); + auto &rows = in_sel0.rows(); +#ifdef PADDLE_WITH_CUDA + std::vector rows_in_cpu; + rows_in_cpu.reserve(rows.size()); + for (auto item : rows) { + rows_in_cpu.push_back(item); + } + in0.reset(new framework::SelectedRows(rows_in_cpu, in_sel0.height())); +#else + in0.reset(new framework::SelectedRows(rows, in_sel0.height())); +#endif + in0->mutable_value()->ShareDataWith(in_sel0.value()); + } + + auto get_selected_row = [&](size_t i) -> const SelectedRows & { + if (i == 0 && in0) { + return *in0.get(); + } else { + return in_vars[i]->Get(); + } + }; + + auto *out = context.Output("Out"); + out->mutable_rows()->clear(); + auto *out_value = out->mutable_value(); + + // Runtime InferShape + size_t first_dim = 0; + for (int i = 0; i < N; i++) { + auto &sel_row = get_selected_row(i); + first_dim += sel_row.rows().size(); + } + auto in_dim = + framework::vectorize(get_selected_row(N - 1).value().dims()); + in_dim[0] = static_cast(first_dim); + + out_value->Resize(framework::make_ddim(in_dim)); + out_value->mutable_data(context.GetPlace()); + + math::SelectedRowsAddTo functor; + + int64_t offset = 0; + for (int i = 0; i < N; i++) { + auto &sel_row = get_selected_row(i); + + PADDLE_ENFORCE_EQ(out->height(), sel_row.height()); + functor(context.template device_context(), sel_row, + offset, out); + offset += sel_row.value().numel(); + } + } else if (out_var->IsType()) { + auto &out_array = *out_var->GetMutable(); + for (size_t i = in_place ? 1 : 0; i < in_vars.size(); ++i) { + PADDLE_ENFORCE(in_vars[i]->IsType(), + "Only support all inputs are TensorArray"); + auto &in_array = in_vars[i]->Get(); + + for (size_t i = 0; i < in_array.size(); ++i) { + if (in_array[i].numel() != 0) { + if (i >= out_array.size()) { + out_array.resize(i + 1); + } + if (out_array[i].numel() == 0) { + framework::Copy(in_array[i], in_array[i].place(), + context.device_context(), &out_array[i]); + out_array[i].set_lod(in_array[i].lod()); + } else { + PADDLE_ENFORCE(out_array[i].lod() == in_array[i].lod()); + auto in = EigenVector::Flatten(in_array[i]); + auto result = EigenVector::Flatten(out_array[i]); + result.device(*context.template device_context() + .eigen_device()) = result + in; + } + } + } + } + } else { + PADDLE_THROW("Unexpected branch, output variable type is %s", + out_var->Type().name()); + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/target_assign_op.cc b/paddle/fluid/operators/target_assign_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..24f1b7252318f8321ef394d321aedb658398e16d --- /dev/null +++ b/paddle/fluid/operators/target_assign_op.cc @@ -0,0 +1,202 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
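The LoDTensor branch of SumKernel above reduces to element-wise accumulation that skips the first input when the output aliases it. A minimal standalone sketch of that behaviour, with std::vector standing in for tensors (the function name is illustrative):

#include <cassert>
#include <vector>

// Accumulate all inputs into *out. If out aliases inputs[0] (the in-place
// case), the first input is skipped instead of zero-initializing the output,
// matching the "i = in_place ? 1 : 0" loop in SumKernel.
void SumDense(const std::vector<const std::vector<float>*>& inputs,
              std::vector<float>* out) {
  bool in_place = (out == inputs[0]);
  if (!in_place) {
    out->assign(inputs[0]->size(), 0.0f);
  }
  for (size_t i = in_place ? 1 : 0; i < inputs.size(); ++i) {
    const auto& in = *inputs[i];
    if (in.empty()) continue;  // mirrors the numel() == 0 check
    for (size_t j = 0; j < in.size(); ++j) (*out)[j] += in[j];
  }
}

int main() {
  std::vector<float> a{1, 2}, b{3, 4}, out;
  SumDense({&a, &b}, &out);
  assert(out[0] == 4 && out[1] == 6);
  SumDense({&a, &b}, &a);  // in-place: a += b, first input skipped
  assert(a[0] == 4 && a[1] == 6);
  return 0;
}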
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/target_assign_op.h" + +namespace paddle { +namespace operators { + +class TargetAssignOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + // checkout inputs + PADDLE_ENFORCE(ctx->HasInput("EncodedGTBBox"), + "Input(EncodedGTBBox) of TargetAssignOp should not be null"); + PADDLE_ENFORCE(ctx->HasInput("GTScoreLabel"), + "Input(GTScoreLabel) of TargetAssignOp should not be null"); + PADDLE_ENFORCE(ctx->HasInput("MatchIndices"), + "Input(MatchIndices) of TargetAssignOp should not be null"); + PADDLE_ENFORCE(ctx->HasInput("NegIndices"), + "Input(NegIndices) of TargetAssignOp should not be null"); + + // checkout outputs + PADDLE_ENFORCE( + ctx->HasOutput("PredBBoxLabel"), + "Output(PredBBoxLabel) of TargetAssignOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("PredBBoxWeight"), + "Output(PredBBoxWeight) of TargetAssignOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("PredScoreLabel"), + "Output(PredScoreLabel) of TargetAssignOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("PredScoreWeight"), + "Output(PredScoreWeight) of TargetAssignOp should not be null."); + + auto blabel_dims = ctx->GetInputDim("EncodedGTBBox"); + auto slabel_dims = ctx->GetInputDim("GTScoreLabel"); + auto mi_dims = ctx->GetInputDim("MatchIndices"); + auto neg_dims = ctx->GetInputDim("NegIndices"); + + PADDLE_ENFORCE_EQ(blabel_dims.size(), 3UL, + "The rank of Input(EncodedGTBBox) must be 3."); + PADDLE_ENFORCE_EQ(slabel_dims.size(), 2UL, + "The rank of Input(GTScoreLabel) must be 2."); + PADDLE_ENFORCE_EQ(mi_dims.size(), 2UL, + "The rank of Input(MatchIndices) must be 2."); + PADDLE_ENFORCE_EQ(neg_dims.size(), 2UL, + "The rank of Input(NegIndices) must be 2."); + + PADDLE_ENFORCE_EQ(blabel_dims[0], slabel_dims[0], + "The 1st dimension (means the total number of " + "ground-truth bounding boxes) of Input(EncodedGTBBox) " + "and Input(GTScoreLabel) must be the same."); + PADDLE_ENFORCE_EQ(blabel_dims[1], mi_dims[1], + "The 2nd dimension (means the number of priod boxes) " + "of Input(EncodedGTBBox) and " + "Input(MatchIndices) must be the same."); + PADDLE_ENFORCE_EQ(blabel_dims[2], 4, + "The 3rd dimension of Input(EncodedGTBBox) must be 4."); + + auto n = mi_dims[0]; + auto np = mi_dims[1]; + ctx->SetOutputDim("PredBBoxLabel", {n, np, 4}); + ctx->SetOutputDim("PredBBoxWeight", {n, np, 1}); + ctx->SetOutputDim("PredScoreLabel", {n, np, 1}); + ctx->SetOutputDim("PredScoreWeight", {n, np, 1}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType( + ctx.Input("EncodedGTBBox")->type()), + ctx.device_context()); + } +}; + +class TargetAssignOpMaker : public framework::OpProtoAndCheckerMaker { + public: + TargetAssignOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("EncodedGTBBox", + "(LoDTensor), The encoded ground-truth bounding boxes with 
shape " + "[Ng, Np, 4], where Ng is the total number of ground-truth boxes " + "in this mini-batch, Np the number of predictions, 4 is the " + "number of coordinate in [xmin, ymin, xmax, ymax] layout."); + AddInput("GTScoreLabel", + "(LoDTensor, default LoDTensor), The input ground-truth " + "labels with shape [Ng, 1], where the Ng is the same as it in " + "the input of EncodedGTBBox."); + AddInput("MatchIndices", + "(Tensor, default Tensor), The input matched indices " + "with shape [N, Np], where N is the batch size, Np is the same " + "as it in the input of EncodedGTBBox. If MatchIndices[i][j] " + "is -1, the j-th prior box is not matched to any ground-truh " + "box in i-th instance."); + AddInput("NegIndices", + "(LoDTensor, default LoDTensor), The input negative example " + "indices with shape [Neg, 1], where is the total number of " + "negative example indices."); + AddAttr("background_label", + "(int, default 0), Label index of background class.") + .SetDefault(0); + AddOutput("PredBBoxLabel", + "(Tensor), The output encoded ground-truth labels " + "with shape [N, Np, 4], N is the batch size and Np, 4 is the " + "same as they in input of EncodedGTBBox. If MatchIndices[i][j] " + "is -1, the PredBBoxLabel[i][j][:] is the encoded ground-truth " + "box for background_label in i-th instance."); + AddOutput("PredBBoxWeight", + "(Tensor), The weight for PredBBoxLabel with the shape " + "of [N, Np, 1]"); + AddOutput("PredScoreLabel", + "(Tensor, default Tensor), The output score labels for " + "each predictions with shape [N, Np, 1]. If MatchIndices[i][j] " + "is -1, PredScoreLabel[i][j] = background_label."); + AddOutput("PredScoreWeight", + "(Tensor), The weight for PredScoreLabel with the shape " + "of [N, Np, 1]"); + AddComment(R"DOC( +This operator is, for given the encoded boxes between prior boxes and +ground-truth boxes and ground-truth class labels, to assign classification +and regression targets to each prior box as well as weights to each +prior box. The weights is used to specify which prior box would not contribute +to training loss. + +For each instance, the output `PredBBoxLabel`, `PredBBoxWeight`, +`PredScoreLabel` and `PredScoreWeight` are assigned based on `MatchIndices`. +Assumed that the row offset for each instance in `EncodedGTBBox` is called lod, +this operato assigns classification/regression targets by performing the +following steps: + +1. Assigning all outpts based on `MatchIndices`: + +If id = MatchIndices[i][j] > 0, + + PredBBoxLabel[i][j] = EncodedGTBBox[lod[i] + id][j] + PredBBoxWeight[i][j] = 1. + PredScoreLabel[i][j] = GTScoreLabel[lod[i] + id] + PredScoreWeight[i][j] = 1. + +Otherwise, + + PredBBoxLabel[j][j] = [0., 0., 0., 0.] + PredBBoxWeight[i][j] = 0. + PredScoreLabel[i][j] = background_label + PredScoreWeight[i][j] = 0. + +2. 
Assigning PredScoreWeight based on `NegIndices`: + +Assumed that the row offset for each instance in `NegIndices` is caleed neg_lod, +for i-th instance and all ids of NegIndices in this instance: + + PredScoreLabel[i][id] = background_label + PredScoreWeight[i][id] = 1.0 + + )DOC"); + } +}; + +template +struct NegTargetAssignFunctor { + void operator()(const platform::CPUDeviceContext& ctx, const int* neg_indices, + const size_t* lod, const int num, const int num_prior_box, + const int background_label, int* out_label, T* out_label_wt) { + for (int i = 0; i < num; ++i) { + for (size_t j = lod[i]; j < lod[i + 1]; ++j) { + int id = neg_indices[j]; + out_label[i * num_prior_box + id] = background_label; + out_label_wt[i * num_prior_box + id] = static_cast(1.0); + } + } + } +}; + +template struct NegTargetAssignFunctor; +template struct NegTargetAssignFunctor; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(target_assign, ops::TargetAssignOp, + ops::TargetAssignOpMaker); +REGISTER_OP_CPU_KERNEL( + target_assign, + ops::TargetAssignKernel, + ops::TargetAssignKernel); diff --git a/paddle/fluid/operators/target_assign_op.cu b/paddle/fluid/operators/target_assign_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..5c012d27ad82eb62d9981c8c73ef5b8cc03adc47 --- /dev/null +++ b/paddle/fluid/operators/target_assign_op.cu @@ -0,0 +1,61 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
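The assignment rule spelled out in the target_assign DOC comment boils down to a small piece of per-element logic, which TargetAssignFunctor in target_assign_op.h applies to every (instance, prior box) pair. Below is a standalone sketch of step 1 with flat row-major arrays in place of tensors; all names are illustrative, not the operator's API.

#include <cassert>
#include <cstddef>
#include <vector>

// Step 1 for a single (instance i, prior box j). Flat row-major buffers:
//   gt_box [Ng, Np, 4], gt_label [Ng], match [N, Np], lod [N + 1].
void AssignOne(int i, int j, int np, const std::vector<float>& gt_box,
               const std::vector<int>& gt_label, const std::vector<int>& match,
               const std::vector<std::size_t>& lod, int background_label,
               float box[4], float* box_wt, int* label, float* label_wt) {
  int id = match[i * np + j];
  if (id > -1) {  // matched: copy the encoded ground-truth box and its label
    std::size_t row = lod[i] + id;  // ground-truth row offset of instance i
    const float* src = &gt_box[(row * np + j) * 4];
    for (int c = 0; c < 4; ++c) box[c] = src[c];
    *label = gt_label[row];
    *box_wt = 1.f;
    *label_wt = 1.f;
  } else {  // unmatched: background label, zero weights
    for (int c = 0; c < 4; ++c) box[c] = 0.f;
    *label = background_label;
    *box_wt = 0.f;
    *label_wt = 0.f;
  }
}

int main() {
  // One instance, one prior box, one ground-truth box matched to it (id = 0).
  std::vector<float> gt_box{0.1f, 0.2f, 0.3f, 0.4f};  // [Ng=1, Np=1, 4]
  std::vector<int> gt_label{7}, match{0};
  std::vector<std::size_t> lod{0, 1};
  float box[4], box_wt, label_wt;
  int label;
  AssignOne(0, 0, 1, gt_box, gt_label, match, lod, 0,
            box, &box_wt, &label, &label_wt);
  assert(label == 7 && box_wt == 1.f && box[2] == 0.3f);
  return 0;
}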
*/ + +#include "paddle/fluid/operators/target_assign_op.h" + +namespace paddle { +namespace operators { + +template +__global__ void NegTargetAssignKernel(const int* neg_indices, const size_t* lod, + const int num, const int num_prior_box, + const int background_label, + int* out_label, T* out_label_wt) { + int bidx = blockIdx.x; + int st = lod[bidx]; + int ed = lod[bidx + 1]; + + int row_start = bidx * num_prior_box; + for (int i = st + threadIdx.x; i < ed; i += blockDim.x) { + int id = row_start + neg_indices[i]; + out_label[id] = background_label; + out_label_wt[id] = 1.; + } +} + +template +struct NegTargetAssignFunctor { + void operator()(const platform::CUDADeviceContext& ctx, + const int* neg_indices, const size_t* lod, const int num, + const int num_prior_box, const int background_label, + int* out_label, T* out_label_wt) { + const int block_size = 256; + const int grid_size = num; + NegTargetAssignKernel<<>>( + neg_indices, lod, num, num_prior_box, background_label, out_label, + out_label_wt); + } +}; + +template struct NegTargetAssignFunctor; +template struct NegTargetAssignFunctor; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + target_assign, + ops::TargetAssignKernel, + ops::TargetAssignKernel); diff --git a/paddle/fluid/operators/target_assign_op.h b/paddle/fluid/operators/target_assign_op.h new file mode 100644 index 0000000000000000000000000000000000000000..876111523af51ce67e804d6646f404c45c00af12 --- /dev/null +++ b/paddle/fluid/operators/target_assign_op.h @@ -0,0 +1,160 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/assert.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { + +template +struct TargetAssignFunctor { + const T* gt_box_; + const int* gt_label_; + const int* match_indices_; + const size_t* lod_; + const int background_label_; + const int64_t num_; + const int64_t num_prior_box_; + + T* out_box_; + T* out_box_wt_; + int* out_label_; + T* out_label_wt_; + + TargetAssignFunctor(const T* gt_box, const int* gt_label, + const int* match_indices, const size_t* lod, + const int background_label, const int64_t num, + const int64_t np, T* out_box, T* out_box_wt, + int* out_label, T* out_label_wt) + : gt_box_(gt_box), + gt_label_(gt_label), + match_indices_(match_indices), + lod_(lod), + background_label_(background_label), + num_(num), + num_prior_box_(np), + out_box_(out_box), + out_box_wt_(out_box_wt), + out_label_(out_label), + out_label_wt_(out_label_wt) {} + + HOSTDEVICE void operator()(size_t i) const { + int row = i / num_prior_box_; + int col = i - row * num_prior_box_; + + size_t row_off = lod_[row]; + int offset = row * num_prior_box_ + col; + + int id = match_indices_[offset]; + T* obox = out_box_ + offset * 4; + int* olabel = out_label_ + offset; + T* obox_wt = out_box_wt_ + offset; + T* olabel_wt = out_label_wt_ + offset; + + if (id > -1) { + const T* gtbox = gt_box_ + ((row_off + id) * num_prior_box_ + col) * 4; + + obox[0] = gtbox[0]; + obox[1] = gtbox[1]; + obox[2] = gtbox[2]; + obox[3] = gtbox[3]; + + olabel[0] = gt_label_[row_off + id]; + obox_wt[0] = static_cast(1.); + olabel_wt[0] = static_cast(1.); + } else { + obox[0] = static_cast(0.); + obox[1] = static_cast(0.); + obox[2] = static_cast(0.); + obox[3] = static_cast(0.); + + olabel[0] = background_label_; + obox_wt[0] = static_cast(0.); + olabel_wt[0] = static_cast(0.); + } + } +}; + +template +struct NegTargetAssignFunctor { + void operator()(const platform::DeviceContext& ctx, const int* neg_indices, + const size_t* lod, const int num, const int num_prior_box, + const int background_label, int* out_label, + T* out_label_wt) const; +}; + +template +class TargetAssignKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* enc_gt_box = ctx.Input("EncodedGTBBox"); + auto* gt_label = ctx.Input("GTScoreLabel"); + auto* match_indices = ctx.Input("MatchIndices"); + auto* neg_indices = ctx.Input("NegIndices"); + + auto* out_box = ctx.Output("PredBBoxLabel"); + auto* out_box_wt = ctx.Output("PredBBoxWeight"); + auto* out_label = ctx.Output("PredScoreLabel"); + auto* out_label_wt = ctx.Output("PredScoreWeight"); + + PADDLE_ENFORCE_EQ(enc_gt_box->lod().size(), 1UL); + PADDLE_ENFORCE_EQ(gt_label->lod().size(), 1UL); + PADDLE_ENFORCE_EQ(neg_indices->lod().size(), 1UL); + + int background_label = ctx.Attr("background_label"); + + const T* box_data = enc_gt_box->data(); + const int* label_data = gt_label->data(); + const int* match_idx_data = match_indices->data(); + const int* neg_idx_data = neg_indices->data(); + + T* obox_data = out_box->mutable_data(ctx.GetPlace()); + T* obox_wt_data = out_box_wt->mutable_data(ctx.GetPlace()); + int* olabel_data = out_label->mutable_data(ctx.GetPlace()); + T* olabel_wt_data = out_label_wt->mutable_data(ctx.GetPlace()); + + int64_t num = match_indices->dims()[0]; + int64_t num_prior_box = match_indices->dims()[1]; + + auto gt_lod = enc_gt_box->lod().back(); + auto gt_label_lod = 
gt_label->lod().back(); + auto neg_lod = neg_indices->lod().back(); + for (size_t i = 0; i < gt_lod.size(); ++i) { + PADDLE_ENFORCE_EQ(gt_lod.data()[i], gt_label_lod.data()[i]); + } + + size_t* gt_lod_data = gt_lod.MutableData(ctx.GetPlace()); + size_t* neg_lod_data = neg_lod.MutableData(ctx.GetPlace()); + + TargetAssignFunctor functor(box_data, label_data, match_idx_data, + gt_lod_data, background_label, num, + num_prior_box, obox_data, obox_wt_data, + olabel_data, olabel_wt_data); + + auto& device_ctx = ctx.template device_context(); + platform::ForRange for_range(device_ctx, + num * num_prior_box); + for_range(functor); + + NegTargetAssignFunctor neg_trg_functor; + neg_trg_functor(device_ctx, neg_idx_data, neg_lod_data, num, num_prior_box, + background_label, olabel_data, olabel_wt_data); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/tensor_array_read_write_op.cc b/paddle/fluid/operators/tensor_array_read_write_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..50811fb22491598849216f41a584ae0b68f8f306 --- /dev/null +++ b/paddle/fluid/operators/tensor_array_read_write_op.cc @@ -0,0 +1,220 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/operators/array_operator.h" +#include "paddle/fluid/operators/detail/safe_ref.h" +namespace paddle { +namespace operators { + +class WriteToArrayOp : public ArrayOp { + public: + WriteToArrayOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : ArrayOp(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto *x = scope.FindVar(Input("X")); + if (x == nullptr) return; + auto &x_tensor = x->Get(); + size_t offset = GetOffset(scope, place); + auto *out = + scope.FindVar(Output("Out"))->GetMutable(); + if (offset >= out->size()) { + VLOG(10) << "Resize " << Output("Out") << " from " << out->size() + << " to " << offset + 1; + out->resize(offset + 1); + } + if (x_tensor.memory_size() > 0) { + auto *out_tensor = &out->at(offset); + + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + Copy(x_tensor, place, dev_ctx, out_tensor); + out_tensor->set_lod(x_tensor.lod()); + } else { + VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so " + "nothing has been written to output array[" + << offset << "]."; + } + } +}; + +class WriteToArrayOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + WriteToArrayOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(LoDTensor) the tensor will be written to tensor array"); + AddInput( + "I", + "(Tensor) the subscript index in tensor array. 
The number of element " + "should be 1"); + AddOutput("Out", "(TensorArray) the tensor array will be written"); + AddComment(R"DOC( +WriteToArray Operator. + +This operator writes a LoDTensor to a LoDTensor array. + +Assume $T$ is LoDTensor, $i$ is the subscript of the array, and $A$ is the array. The +equation is + +$$A[i] = T$$ + +)DOC"); + } +}; + +class WriteToArrayInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInput("I"), "Must set the subscript index"); + PADDLE_ENFORCE_EQ(framework::product(context->GetInputDim("I")), 1, + "The number of element of subscript index must be 1"); + if (!context->HasInput("X")) { + return; + } + PADDLE_ENFORCE(context->HasOutput("Out"), NotHasOutError()); + context->SetOutputDim("Out", context->GetInputDim("X")); + } + + protected: + virtual const char *NotHasXError() const { return "Must set the lod tensor"; } + + virtual const char *NotHasOutError() const { + return "Must set the lod tensor array"; + } +}; + +class WriteToArrayInferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + auto x_name = op_desc.Input("X")[0]; + auto out_name = op_desc.Output("Out")[0]; + VLOG(10) << "Set Variable " << out_name << " as LOD_TENSOR_ARRAY"; + auto &out = block->FindRecursiveOrCreateVar(out_name); + out.SetType(framework::proto::VarDesc::LOD_TENSOR_ARRAY); + auto *x = block->FindVarRecursive(x_name); + if (x != nullptr) { + out.SetDataType(x->GetDataType()); + } + } +}; + +class ReadFromArrayOp : public ArrayOp { + public: + ReadFromArrayOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : ArrayOp(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto *x = scope.FindVar(Input("X")); + PADDLE_ENFORCE(x != nullptr, "X must be set"); + auto &x_array = x->Get(); + auto *out = scope.FindVar(Output("Out")); + PADDLE_ENFORCE(out != nullptr, "Out must be set"); + size_t offset = GetOffset(scope, place); + if (offset < x_array.size()) { + auto *out_tensor = out->GetMutable(); + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + framework::Copy(x_array[offset], place, dev_ctx, out_tensor); + out_tensor->set_lod(x_array[offset].lod()); + } else { + VLOG(10) << "offset " << offset << " >= " << x_array.size(); + } + } +}; + +class ReadFromArrayProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + ReadFromArrayProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(TensorArray) the array will be read from."); + AddInput("I", + "(Tensor) the subscript index in tensor array. The number of " + "element should be 1"); + AddOutput("Out", "(LoDTensor) the tensor will be read from."); + AddComment(R"DOC( +ReadFromArray Operator. + +Read a LoDTensor from a LoDTensor Array. + +Assume $T$ is LoDTensor, $i$ is the subscript of the array, and $A$ is the array. 
The +equation is + +$$T = A[i]$$ + +)DOC"); + } +}; + +class ReadFromArrayInferShape : public WriteToArrayInferShape { + protected: + const char *NotHasXError() const override { + return "The input array X must be set"; + } + const char *NotHasOutError() const override { + return "The output tensor out must be set"; + } +}; + +class WriteToArrayGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("read_from_array"); + grad_op->SetInput("I", Input("I")); + grad_op->SetInput("X", OutputGrad("Out")); + grad_op->SetOutput("Out", InputGrad("X")); + grad_op->SetAttrMap(Attrs()); + return std::unique_ptr(grad_op); + } +}; + +class ReadFromArrayGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("write_to_array"); + grad_op->SetInput("I", Input("I")); + grad_op->SetInput("X", OutputGrad("Out")); + grad_op->SetOutput("Out", InputGrad("X")); + grad_op->SetAttrMap(Attrs()); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(write_to_array, ops::WriteToArrayOp, + ops::WriteToArrayInferShape, ops::WriteToArrayOpProtoMaker, + ops::WriteToArrayGradMaker, ops::WriteToArrayInferVarType); +REGISTER_OPERATOR(read_from_array, ops::ReadFromArrayOp, + ops::ReadFromArrayInferShape, ops::ReadFromArrayProtoMaker, + ops::ReadFromArrayGradMaker); diff --git a/paddle/fluid/operators/top_k_op.cc b/paddle/fluid/operators/top_k_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c81ea860d0c9fa5498de2d149e3d05d080ad729f --- /dev/null +++ b/paddle/fluid/operators/top_k_op.cc @@ -0,0 +1,78 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
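Conceptually, write_to_array and read_from_array behave like subscript assignment and lookup on a growable array of tensors: writes past the end grow the array, and out-of-range reads leave the output untouched. A framework-free sketch of the two Run() bodies, with std::vector<std::vector<float>> standing in for LoDTensorArray (names are illustrative):

#include <cassert>
#include <vector>

using Tensor = std::vector<float>;
using TensorArray = std::vector<Tensor>;

// write_to_array: A[i] = T, resizing the array when i is past the end.
void WriteToArray(const Tensor& t, size_t i, TensorArray* a) {
  if (i >= a->size()) a->resize(i + 1);
  (*a)[i] = t;  // the real op copies the LoDTensor together with its LoD
}

// read_from_array: T = A[i]; returns false for an out-of-range subscript.
bool ReadFromArray(const TensorArray& a, size_t i, Tensor* t) {
  if (i >= a.size()) return false;
  *t = a[i];
  return true;
}

int main() {
  TensorArray arr;
  WriteToArray({1.f, 2.f}, 3, &arr);  // the array grows to size 4
  assert(arr.size() == 4);
  Tensor out;
  assert(ReadFromArray(arr, 3, &out) && out[1] == 2.f);
  return 0;
}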
*/ + +#include "paddle/fluid/operators/top_k_op.h" + +namespace paddle { +namespace operators { + +class TopkOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of TopkOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of TopkOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Indices"), + "Output(Indices) of TopkOp should not be null."); + + auto input_dims = ctx->GetInputDim("X"); + const int k = static_cast(ctx->Attrs().Get("k")); + + PADDLE_ENFORCE_GE(k, 1, "k must >= 1"); + PADDLE_ENFORCE_GE(input_dims.size(), 1, "input must have >= 1d shape"); + PADDLE_ENFORCE_GE(input_dims[input_dims.size() - 1], k, + "input must have >= k columns"); + + framework::DDim dims = input_dims; + dims[dims.size() - 1] = k; + ctx->SetOutputDim("Out", dims); + ctx->SetOutputDim("Indices", dims); + ctx->ShareLoD("X", "Out"); + ctx->ShareLoD("X", "Indices"); + } +}; + +class TopkOpMaker : public framework::OpProtoAndCheckerMaker { + public: + TopkOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor) The input of Topk op"); + AddOutput("Out", "(Tensor) The output tensor of Topk op"); + AddOutput("Indices", "(Tensor) The indices of Topk elements of input"); + AddComment(R"DOC( +Top K operator + +If the input is a vector (1d tensor), this operator finds the k largest +entries in the vector and outputs their values and indices as vectors. +Thus values[j] is the j-th largest entry in input, and its index is indices[j]. + +For matrices, this operator computes the top k entries in each row. )DOC"); + AddAttr("k", + "(int, default 1) Number of top elements to look for along " + "the last dimension (along each row for matrices).") + .SetDefault(1); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(top_k, ops::TopkOp, ops::TopkOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(top_k, + ops::TopkKernel); diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..5390cb5063bcc302f5ff9cfe96bd421b477eeb3f --- /dev/null +++ b/paddle/fluid/operators/top_k_op.cu @@ -0,0 +1,320 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
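Per row, the behaviour described in the Top K comment above matches what the CPU kernel in top_k_op.h further below does with std::partial_sort: keep the k largest (value, index) pairs, ordered from largest to smallest. A standalone sketch of one row (function and variable names are illustrative):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

// Return the k largest values of one row together with their indices,
// largest first, as the CPU TopkKernel does for each row of the input.
void RowTopK(const std::vector<float>& row, size_t k,
             std::vector<float>* values, std::vector<int64_t>* indices) {
  std::vector<std::pair<float, int64_t>> vec;
  vec.reserve(row.size());
  for (size_t j = 0; j < row.size(); ++j) vec.emplace_back(row[j], j);
  std::partial_sort(vec.begin(), vec.begin() + k, vec.end(),
                    [](const std::pair<float, int64_t>& l,
                       const std::pair<float, int64_t>& r) {
                      return l.first > r.first;
                    });
  values->clear();
  indices->clear();
  for (size_t j = 0; j < k; ++j) {
    values->push_back(vec[j].first);
    indices->push_back(vec[j].second);
  }
}

int main() {
  std::vector<float> values;
  std::vector<int64_t> indices;
  RowTopK({0.1f, 0.9f, 0.4f, 0.7f}, 2, &values, &indices);
  assert(indices[0] == 1 && indices[1] == 3);  // 0.9 then 0.7
  return 0;
}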
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/assert.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +struct Pair { + __device__ __forceinline__ Pair() {} + __device__ __forceinline__ Pair(T value, int64_t id) : v(value), id(id) {} + + __device__ __forceinline__ void set(T value, int64_t id) { + v = value; + id = id; + } + + __device__ __forceinline__ void operator=(const Pair& in) { + v = in.v; + id = in.id; + } + + __device__ __forceinline__ bool operator<(const T value) const { + return (v < value); + } + + __device__ __forceinline__ bool operator<(const Pair& in) const { + return (v < in.v) || ((v == in.v) && (id > in.id)); + } + + __device__ __forceinline__ bool operator>(const Pair& in) const { + return (v > in.v) || ((v == in.v) && (id < in.id)); + } + + T v; + int64_t id; +}; + +template +__device__ __forceinline__ void AddTo(Pair topk[], const Pair& p, + int beam_size) { + for (int k = beam_size - 2; k >= 0; k--) { + if (topk[k] < p) { + topk[k + 1] = topk[k]; + } else { + topk[k + 1] = p; + return; + } + } + topk[0] = p; +} + +template +__device__ __forceinline__ void AddTo(Pair topk[], const Pair& p) { + for (int k = beam_size - 2; k >= 0; k--) { + if (topk[k] < p) { + topk[k + 1] = topk[k]; + } else { + topk[k + 1] = p; + return; + } + } + topk[0] = p; +} + +template +__device__ __forceinline__ void GetTopK(Pair topk[], const T* src, int idx, + int dim, int beam_size) { + while (idx < dim) { + if (topk[beam_size - 1] < src[idx]) { + Pair tmp(src[idx], idx); + AddTo(topk, tmp, beam_size); + } + idx += BlockSize; + } +} + +template +__device__ __forceinline__ void GetTopK(Pair topk[], const T* src, int idx, + int dim, const Pair& max, + int beam_size) { + while (idx < dim) { + if (topk[beam_size - 1] < src[idx]) { + Pair tmp(src[idx], idx); + if (tmp < max) { + AddTo(topk, tmp, beam_size); + } + } + idx += BlockSize; + } +} + +template +__device__ __forceinline__ void GetTopK(Pair topk[], const T* val, int* col, + int idx, int dim, int beam_size) { + while (idx < dim) { + if (topk[beam_size - 1] < val[idx]) { + Pair tmp(val[idx], col[idx]); + AddTo(topk, tmp, beam_size); + } + idx += BlockSize; + } +} + +template +__device__ __forceinline__ void GetTopK(Pair topk[], const T* val, int* col, + int idx, int dim, const Pair& max, + int beam_size) { + while (idx < dim) { + if (topk[beam_size - 1] < val[idx]) { + Pair tmp(val[idx], col[idx]); + if (tmp < max) { + AddTo(topk, tmp, beam_size); + } + } + idx += BlockSize; + } +} + +template +__device__ __forceinline__ void ThreadGetTopK(Pair topk[], int& beam, + int beam_size, const T* src, + bool& firstStep, bool& is_empty, + Pair& max, int dim, + const int tid) { + if (beam > 0) { + int length = beam < beam_size ? beam : beam_size; + if (firstStep) { + firstStep = false; + GetTopK(topk, src, tid, dim, length); + } else { + for (int k = 0; k < MaxLength; k++) { + if (k < MaxLength - beam) { + topk[k] = topk[k + beam]; + } else { + topk[k].set(-INFINITY, -1); + } + } + if (!is_empty) { + GetTopK(topk + MaxLength - beam, src, tid, dim, max, + length); + } + } + + max = topk[MaxLength - 1]; + if (max.v == -1) is_empty = true; + beam = 0; + } +} + +template +__device__ __forceinline__ void ThreadGetTopK(Pair topk[], int& beam, + int beam_size, const T* val, + int* col, bool& firstStep, + bool& is_empty, Pair& max, + int dim, const int tid) { + if (beam > 0) { + int length = beam < beam_size ? 
beam : beam_size; + if (firstStep) { + firstStep = false; + GetTopK(topk, val, col, tid, dim, length); + } else { + for (int k = 0; k < MaxLength; k++) { + if (k < MaxLength - beam) { + topk[k] = topk[k + beam]; + } else { + topk[k].set(-INFINITY, -1); + } + } + if (!is_empty) { + GetTopK(topk + MaxLength - beam, val, col, tid, dim, max, + length); + } + } + + max = topk[MaxLength - 1]; + if (max.v == -1) is_empty = true; + beam = 0; + } +} + +template +__device__ __forceinline__ void BlockReduce(Pair* sh_topk, int* maxid, + Pair topk[], T** topVal, + int64_t** topIds, int& beam, int& k, + const int tid, const int warp) { + while (true) { + __syncthreads(); + if (tid < BlockSize / 2) { + if (sh_topk[tid] < sh_topk[tid + BlockSize / 2]) { + maxid[tid] = tid + BlockSize / 2; + } else { + maxid[tid] = tid; + } + } + __syncthreads(); + for (int stride = BlockSize / 4; stride > 0; stride = stride / 2) { + if (tid < stride) { + if (sh_topk[maxid[tid]] < sh_topk[maxid[tid + stride]]) { + maxid[tid] = maxid[tid + stride]; + } + } + __syncthreads(); + } + __syncthreads(); + + if (tid == 0) { + **topVal = sh_topk[maxid[0]].v; + **topIds = sh_topk[maxid[0]].id; + (*topVal)++; + (*topIds)++; + } + if (tid == maxid[0]) beam++; + if (--k == 0) break; + __syncthreads(); + + if (tid == maxid[0]) { + if (beam < MaxLength) { + sh_topk[tid] = topk[beam]; + } + } + if (maxid[0] / 32 == warp) { + if (__shfl(beam, (maxid[0]) % 32, 32) == MaxLength) break; + } + } +} + +/** + * Each block compute one sample. + * In a block: + * 1. every thread get top MaxLength value; + * 2. merge to sh_topk, block reduce and get max value; + * 3. go to the second setp, until one thread's topk value is null; + * 4. go to the first setp, until get the topk value. + */ +template +__global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices, + const T* src, int lds, int dim, int k) { + __shared__ Pair sh_topk[BlockSize]; + __shared__ int maxid[BlockSize / 2]; + const int tid = threadIdx.x; + const int warp = threadIdx.x / 32; + output += blockIdx.x * output_stride; + indices += blockIdx.x * k; + + Pair topk[MaxLength]; + int beam = MaxLength; + Pair max; + bool is_empty = false; + bool firststep = true; + + for (int k = 0; k < MaxLength; k++) { + topk[k].set(-INFINITY, -1); + } + while (k) { + ThreadGetTopK(topk, beam, k, + src + blockIdx.x * lds, firststep, + is_empty, max, dim, tid); + + sh_topk[tid] = topk[0]; + BlockReduce(sh_topk, maxid, topk, &output, + &indices, beam, k, tid, warp); + } +} + +template +class TopkOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + auto* indices = ctx.Output("Indices"); + size_t k = static_cast(ctx.Attr("k")); + + const T* input_data = input->data(); + + T* output_data = output->mutable_data(ctx.GetPlace()); + // FIXME(typhoonzero): data is always converted to type T? + int64_t* indices_data = indices->mutable_data(ctx.GetPlace()); + + size_t input_height = input->dims()[0]; + size_t input_width = input->dims()[1]; + if (k > input_width) k = input_width; + + // NOTE: pass lds and dim same to input width. + // NOTE: old matrix implementation of stride is different to eigen. + // TODO(typhoonzero): refine this kernel. 
+ dim3 threads(256, 1); + dim3 grid(input_height, 1); + + KeMatrixTopK<<< + grid, threads, 0, reinterpret_cast( + ctx.device_context()) + .stream()>>>(output_data, output->dims()[1], + indices_data, input_data, + input_width, input_width, int(k)); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL(top_k, paddle::operators::TopkOpCUDAKernel); diff --git a/paddle/fluid/operators/top_k_op.h b/paddle/fluid/operators/top_k_op.h new file mode 100644 index 0000000000000000000000000000000000000000..e32b35150070b30c3ccbbb9483c1f24b1d205919 --- /dev/null +++ b/paddle/fluid/operators/top_k_op.h @@ -0,0 +1,77 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +using EigenMatrix = framework::EigenMatrix; + +template +class TopkKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + // Get the top k elements of each row of input tensor + // FIXME: only deal with matrix(2d tensor). + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + auto* indices = ctx.Output("Indices"); + // k is determined by Attr + const size_t k = static_cast(ctx.Attr("k")); + + T* output_data = output->mutable_data(ctx.GetPlace()); + int64_t* indices_data = indices->mutable_data(ctx.GetPlace()); + + auto eg_input = EigenMatrix::From(*input); + + // reshape input to a flattern matrix(like flat_inner_dims) + framework::DDim inputdims = input->dims(); + const size_t row = framework::product( + framework::slice_ddim(inputdims, 0, inputdims.size() - 1)); + const size_t col = inputdims[inputdims.size() - 1]; + Eigen::DSizes flat2dims(row, col); + // NOTE: eigen shape doesn't affect paddle tensor. + eg_input.reshape(flat2dims); + + for (size_t i = 0; i < row; i++) { + std::vector> vec; + for (size_t j = 0; j < col; j++) { + vec.push_back(std::pair(eg_input(i, j), j)); + } + + std::partial_sort( + vec.begin(), vec.begin() + k, vec.end(), + [](const std::pair& l, const std::pair& r) { + return l.first > r.first; + }); + for (size_t j = 0; j < k; j++) { + output_data[i * k + j] = vec[j].first; + indices_data[i * k + j] = int64_t(vec[j].second); + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a3d8acffc269c1abe98d2de39dcf09fbf3d825f3 --- /dev/null +++ b/paddle/fluid/operators/transpose_op.cc @@ -0,0 +1,126 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/transpose_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class TransposeOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null"); + auto x_dims = ctx->GetInputDim("X"); + std::vector axis = ctx->Attrs().Get>("axis"); + size_t x_rank = x_dims.size(); + size_t axis_size = axis.size(); + + PADDLE_ENFORCE_EQ(x_rank, axis_size, + "The input tensor's rank(%d) " + "should be equal to the axis's size(%d)", + x_rank, axis_size); + + std::vector count(axis_size, 0); + for (size_t i = 0; i < axis_size; i++) { + PADDLE_ENFORCE( + axis[i] < static_cast(axis_size) && ++count[axis[i]] == 1, + "Each element of Attribute axis should be a unique value " + "range from 0 to (dims - 1), " + "where the dims is the axis's size"); + } + + framework::DDim out_dims(x_dims); + for (size_t i = 0; i < axis_size; i++) { + out_dims[i] = x_dims[axis[i]]; + } + ctx->SetOutputDim("Out", out_dims); + } +}; + +class TransposeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + TransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "X", + "(Tensor) The input tensor, tensors with rank up to 6 are supported."); + AddOutput("Out", "(Tensor)The output tensor."); + AddAttr>( + "axis", + "(vector) A list of values, and the size of the list should be " + "the same with the input tensor rank. This operator permutes the input " + "tensor's axes according to the values given."); + AddComment(R"DOC( +Transpose Operator. + +The input tensor will be permuted according to the axes given. +The behavior of this operator is similar to how `numpy.transpose` works. + +- suppose the input `X` is a 2-D tensor: + $$ + X = \begin{pmatrix} + 0 &1 &2 \\ + 3 &4 &5 + \end{pmatrix}$$ + + the given `axes` is: $[1, 0]$, and $Y$ = transpose($X$, axis) + + then the output $Y$ is: + + $$ + Y = \begin{pmatrix} + 0 &3 \\ + 1 &4 \\ + 2 &5 + \end{pmatrix}$$ + +- Given a input tensor with shape $(N, C, H, W)$ and the `axes` is +$[0, 2, 3, 1]$, then shape of the output tensor will be: $(N, H, W, C)$. 
+ +)DOC"); + } +}; + +class TransposeOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + auto x_dims = ctx->GetInputDim("X"); + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(transpose, ops::TransposeOp, ops::TransposeOpMaker, transpose_grad, + ops::TransposeOpGrad); +REGISTER_OP_CPU_KERNEL( + transpose, ops::TransposeKernel); +REGISTER_OP_CPU_KERNEL( + transpose_grad, + ops::TransposeGradKernel); diff --git a/paddle/fluid/operators/transpose_op.cu.cc b/paddle/fluid/operators/transpose_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..f8667ab369e1de9c2b74ba902242355d1660d24a --- /dev/null +++ b/paddle/fluid/operators/transpose_op.cu.cc @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/transpose_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + transpose, + ops::TransposeKernel); +REGISTER_OP_CUDA_KERNEL( + transpose_grad, + ops::TransposeGradKernel); diff --git a/paddle/fluid/operators/transpose_op.h b/paddle/fluid/operators/transpose_op.h new file mode 100644 index 0000000000000000000000000000000000000000..1fb419474ab078efa64523454bbbbb6176a58d40 --- /dev/null +++ b/paddle/fluid/operators/transpose_op.h @@ -0,0 +1,98 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +inline void TransCompute(const int dim, const DeviceContext& dev_ctx, + const framework::Tensor& in, framework::Tensor* out, + const std::vector& axis) { + switch (dim) { + case 1: + math::Transpose trans1; + trans1(dev_ctx, in, out, axis); + break; + case 2: + math::Transpose trans2; + trans2(dev_ctx, in, out, axis); + break; + case 3: + math::Transpose trans3; + trans3(dev_ctx, in, out, axis); + break; + case 4: + math::Transpose trans4; + trans4(dev_ctx, in, out, axis); + break; + case 5: + math::Transpose trans5; + trans5(dev_ctx, in, out, axis); + break; + case 6: + math::Transpose trans6; + trans6(dev_ctx, in, out, axis); + break; + default: + PADDLE_THROW("Tensors with rank at most 6 are supported"); + } +} + +template +class TransposeKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + out->mutable_data(context.GetPlace()); + + std::vector axis = context.Attr>("axis"); + int ndims = axis.size(); + auto& dev_ctx = context.template device_context(); + TransCompute(ndims, dev_ctx, *x, out, axis); + } +}; + +template +class TransposeGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* out_grad = + context.Input(framework::GradVarName("Out")); + auto* x_grad = + context.Output(framework::GradVarName("X")); + if (!x_grad) return; + + x_grad->mutable_data(context.GetPlace()); + std::vector axis = context.Attr>("axis"); + std::vector reversed_axis(axis); + + for (size_t i = 0; i < axis.size(); i++) { + reversed_axis[axis[i]] = i; + } + + int ndims = axis.size(); + auto& dev_ctx = context.template device_context(); + TransCompute(ndims, dev_ctx, *out_grad, x_grad, + reversed_axis); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..b6fea1d4485fd7f88375c96511c85396b707bf1c --- /dev/null +++ b/paddle/fluid/operators/uniform_random_op.cc @@ -0,0 +1,112 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { + +// It seems that Eigen::Tensor::random in GPU will SEGFAULT. +// Use std::random and thrust::random(thrust is a std library in CUDA) to +// implement uniform random. 
+template +class CPUUniformRandomKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* tensor = ctx.Output("Out"); + T* data = tensor->mutable_data(ctx.GetPlace()); + unsigned int seed = static_cast(ctx.Attr("seed")); + std::minstd_rand engine; + if (seed == 0) { + seed = std::random_device()(); + } + engine.seed(seed); + std::uniform_real_distribution dist( + static_cast(ctx.Attr("min")), + static_cast(ctx.Attr("max"))); + int64_t size = tensor->numel(); + for (int64_t i = 0; i < size; ++i) { + data[i] = dist(engine); + } + } +}; + +class UniformRandomOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of UniformRandomOp should not be null."); + + PADDLE_ENFORCE( + ctx->Attrs().Get("min") < ctx->Attrs().Get("max"), + "uniform_random's min must less then max"); + auto& shape = ctx->Attrs().Get>("shape"); + std::vector temp; + temp.reserve(shape.size()); + for (auto dim : shape) { + temp.push_back(static_cast(dim)); + } + ctx->SetOutputDim("Out", framework::make_ddim(temp)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + static_cast(ctx.Attr("dtype")), + ctx.GetPlace()); + } +}; + +class UniformRandomOpMaker : public framework::OpProtoAndCheckerMaker { + public: + UniformRandomOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddOutput("Out", "(Tensor) The output tensor of uniform random op"); + AddComment(R"DOC( +Uniform random operator. + +This operator initializes a tensor with random values sampled from a +uniform distribution. + +)DOC"); + AddAttr>("shape", + "(vector) The shape of the output tensor"); + AddAttr("min", + "(float, default -1.0) " + "Minimum value of uniform random") + .SetDefault(-1.0f); + AddAttr("max", + "(float, default 1.0) " + "Maximun value of uniform random") + .SetDefault(1.0f); + AddAttr("seed", + "(int, default 0) " + "Random seed used for generating samples. " + "0 means use a seed generated by the system.") + .SetDefault(0); + AddAttr("dtype", "(int, default 5(FP32)) Output tensor data type") + .SetDefault(framework::proto::DataType::FP32); + } +}; +} // namespace operators +} // namespace paddle + +REGISTER_OP_WITHOUT_GRADIENT(uniform_random, paddle::operators::UniformRandomOp, + paddle::operators::UniformRandomOpMaker); +REGISTER_OP_CPU_KERNEL(uniform_random, + paddle::operators::CPUUniformRandomKernel, + paddle::operators::CPUUniformRandomKernel); diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..9afca68e59f8c0eedaf38be4b51343b9cb043f65 --- /dev/null +++ b/paddle/fluid/operators/uniform_random_op.cu @@ -0,0 +1,68 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
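Stripped of the tensor plumbing, the CPU kernel above is std::minstd_rand feeding std::uniform_real_distribution, with seed 0 meaning that a seed is drawn from std::random_device. A minimal illustration follows; the helper name is not part of the operator.

#include <random>
#include <vector>

// Fill `data` with samples from U[min, max), seeding the engine the same way
// CPUUniformRandomKernel does.
void FillUniform(std::vector<float>* data, float min, float max,
                 unsigned int seed) {
  if (seed == 0) seed = std::random_device()();
  std::minstd_rand engine(seed);
  std::uniform_real_distribution<float> dist(min, max);
  for (auto& v : *data) v = dist(engine);
}

int main() {
  std::vector<float> out(16);
  FillUniform(&out, -1.0f, 1.0f, /*seed=*/42);  // deterministic for a fixed seed
  return 0;
}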
+See the License for the specific language governing permissions and +limitations under the License. */ +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { + +template +struct UniformGenerator { + T min_, max_; + unsigned int seed_; + + __host__ __device__ UniformGenerator(T min, T max, int seed) + : min_(min), max_(max), seed_(seed) {} + + __host__ __device__ T operator()(const unsigned int n) const { + thrust::minstd_rand rng; + rng.seed(seed_); + thrust::uniform_real_distribution dist(min_, max_); + rng.discard(n); + return dist(rng); + } +}; + +// It seems that Eigen::Tensor::random in GPU will SEGFAULT. +// Use std::random and thrust::random(thrust is a std library in CUDA) to +// implement uniform random. +template +class GPUUniformRandomKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* tensor = context.Output("Out"); + T* data = tensor->mutable_data(context.GetPlace()); + unsigned int seed = static_cast(context.Attr("seed")); + if (seed == 0) { + std::random_device rd; + seed = rd(); + } + T min = static_cast(context.Attr("min")); + T max = static_cast(context.Attr("max")); + thrust::counting_iterator index_sequence_begin(0); + int64_t size = tensor->numel(); + thrust::transform(index_sequence_begin, index_sequence_begin + size, + thrust::device_ptr(data), + UniformGenerator(min, max, seed)); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL(uniform_random, + paddle::operators::GPUUniformRandomKernel, + paddle::operators::GPUUniformRandomKernel); diff --git a/paddle/fluid/operators/unpool_op.cc b/paddle/fluid/operators/unpool_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..2e0b271fed69772a53a776f290d524565df3dc94 --- /dev/null +++ b/paddle/fluid/operators/unpool_op.cc @@ -0,0 +1,141 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/unpool_op.h" +namespace paddle { +namespace operators { + +class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker { + public: + Unpool2dOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "X", + "(Tensor) The input tensor of unpool operator. " + "The format of input tensor is NCHW. Where N is batch size, C is the " + "number of channels, H and W is the height and width of feature."); + AddInput( + "Indices", + "(Tensor) The input tensor of the indices given out by MaxPool2d. " + "The format of input tensor is NCHW. Where N is batch size, C is the " + "number of channels, H and W is the height and width of feature."); + AddOutput("Out", + "(Tensor) The output tensor of unpool operator." + "The format of output tensor is also NCHW." 
+ "Where N is batch size, C is " + "the number of channels, H and W is the height and " + "width of feature."); + AddAttr>( + "ksize", + "(vector), the unpooling window size(height, width) " + "of unpooling operator."); + AddAttr>("strides", + "(vector, default:{1, 1}), " + "strides (height, width) of unpooling operator.") + .SetDefault({1, 1}); + AddAttr>("paddings", + "(vector defalut:{0,0}), " + "paddings (height, width) of unpooling operator.") + .SetDefault({0, 0}); + AddAttr( + "unpooling_type", + "(string), unpooling type, can be \"max\" for max-unpooling ") + .InEnum({"max"}); + AddComment(R"DOC( +Input shape is: $(N, C_{in}, H_{in}, W_{in})$, Output shape is: +$(N, C_{out}, H_{out}, W_{out})$, where +$$ +H_{out} = (H_{in}−1) * strides[0] − 2 * paddings[0] + ksize[0] \\ +W_{out} = (W_{in}−1) * strides[1] − 2 * paddings[1] + ksize[1] +$$ +Paper: http://www.matthewzeiler.com/wp-content/uploads/2017/07/iccv2011.pdf +)DOC"); + } +}; + +int OutputSize(int input_size, int ksize, int padding, int stride) { + int output_size = (input_size - 1) * stride - 2 * padding + ksize; + return output_size; +} + +class UnpoolOp : public framework::OperatorWithKernel { + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } + + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of UnpoolOp" + "should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Indices"), + "Input(Indices) of UnpoolOp" + "should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of UnpoolOp should not be null."); + auto in_x_dims = ctx->GetInputDim("X"); + auto in_y_dims = ctx->GetInputDim("Indices"); + std::string unpooling_type = + ctx->Attrs().Get("unpooling_type"); + std::vector ksize = ctx->Attrs().Get>("ksize"); + std::vector strides = ctx->Attrs().Get>("strides"); + std::vector paddings = ctx->Attrs().Get>("paddings"); + PADDLE_ENFORCE(in_x_dims.size() == 4, + "Unpooling intput must be of 4-dimensional."); + PADDLE_ENFORCE_EQ(in_x_dims, in_y_dims); + std::vector output_shape({in_x_dims[0], in_x_dims[1]}); + for (size_t i = 0; i < ksize.size(); ++i) { + output_shape.push_back( + OutputSize(in_x_dims[i + 2], ksize[i], paddings[i], strides[i])); + } + ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + } +}; + +class UnpoolOpGrad : public framework::OperatorWithKernel { + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } + + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Input(X@GRAD) should not be null."); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(unpool, ops::UnpoolOp, ops::Unpool2dOpMaker, unpool_grad, + ops::UnpoolOpGrad); +REGISTER_OP_CPU_KERNEL( + unpool, ops::UnpoolKernel, + ops::UnpoolKernel); +REGISTER_OP_CPU_KERNEL( + unpool_grad, + 
ops::UnpoolGradKernel, + ops::UnpoolGradKernel); diff --git a/paddle/fluid/operators/unpool_op.cu.cc b/paddle/fluid/operators/unpool_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..15d81eb296ba35d9f67426083870c1a83ff66ee5 --- /dev/null +++ b/paddle/fluid/operators/unpool_op.cu.cc @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/unpool_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + unpool, ops::UnpoolKernel, + ops::UnpoolKernel); +REGISTER_OP_CUDA_KERNEL( + unpool_grad, + ops::UnpoolGradKernel, + ops::UnpoolGradKernel); diff --git a/paddle/fluid/operators/unpool_op.h b/paddle/fluid/operators/unpool_op.h new file mode 100644 index 0000000000000000000000000000000000000000..ceed5507391b40491d4a26bbc51e8c861e1bf1c2 --- /dev/null +++ b/paddle/fluid/operators/unpool_op.h @@ -0,0 +1,71 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/unpooling.h" + +namespace paddle { +namespace operators { +template +class UnpoolKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor* in_x = context.Input("X"); + const framework::Tensor* in_y = context.Input("Indices"); + auto* out = context.Output("Out"); + std::string unpooling_type = context.Attr("unpooling_type"); + std::vector ksize = context.Attr>("ksize"); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + T* output_data = out->mutable_data(context.GetPlace()); + auto& dev_ctx = context.template device_context(); + if (output_data) { + math::SetConstant set_zero; + set_zero(dev_ctx, out, static_cast(0)); + } + math::Unpool2dMaxFunctor unpool2d_max_forward; + unpool2d_max_forward(dev_ctx, *in_x, *in_y, out); + } +}; +template +class UnpoolGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor* in_x = context.Input("X"); + const framework::Tensor* in_y = context.Input("Indices"); + const framework::Tensor* out = context.Input("Out"); + const framework::Tensor* out_grad = + context.Input(framework::GradVarName("Out")); + framework::Tensor* in_x_grad = + context.Output(framework::GradVarName("X")); + std::string unpooling_type = context.Attr("unpooling_type"); + std::vector ksize = context.Attr>("ksize"); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + + auto& device_ctx = context.template device_context(); + math::SetConstant zero; + if (in_x_grad) { + in_x_grad->mutable_data(context.GetPlace()); + zero(device_ctx, in_x_grad, static_cast(0)); + } + math::Unpool2dMaxGradFunctor unpool2d_max_backward; + unpool2d_max_backward(device_ctx, *in_x, *in_y, *out, *out_grad, in_x_grad); + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/warpctc_op.cc b/paddle/fluid/operators/warpctc_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..1c05fed0b47c3bb3582e4b261ef188146d41820e --- /dev/null +++ b/paddle/fluid/operators/warpctc_op.cc @@ -0,0 +1,141 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/warpctc_op.h" + +namespace paddle { +namespace operators { + +class WarpCTCOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Logits"), + "Input(Logits) of WarpCTCOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), + "Input(Label) of WarpCTCOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("WarpCTCGrad"), + "Output(WarpCTCGrad) of WarpCTCOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Loss"), + "Output(Loss) of WarpCTCOp should not be null."); + + auto logits_dims = ctx->GetInputDim("Logits"); + int sequence_width = + static_cast(framework::product(logits_dims) / logits_dims[0]); + int blank = ctx->Attrs().Get("blank"); + PADDLE_ENFORCE((blank >= 0) && (blank < sequence_width), + "The value of Attr(blank) should be in interval [0, %d).", + sequence_width); + // TODO(liuyiqun): it is tricky to set the wrong dimension here. + ctx->SetOutputDim("Loss", {logits_dims[0], 1}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Logits")->type()), + ctx.device_context()); + } +}; + +class WarpCTCOpMaker : public framework::OpProtoAndCheckerMaker { + public: + WarpCTCOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Logits", + "(LoDTensor, default: LoDTensor), the unscaled " + "probabilities of variable-length sequences, which is a 2-D " + "Tensor with LoD information. Its shape is " + "[Lp, num_classes + 1], where Lp is the sum of all input " + "sequences' lengths and num_classes is the true number of classes " + "(not including the blank label)."); + AddInput("Label", + "(LoDTensor, default: LoDTensor), the ground truth " + "of variable-length sequences, which is a 2-D Tensor with LoD " + "information. It is of the shape [Lg, 1], where Lg is the sum of " + "all labels' lengths."); + AddOutput("WarpCTCGrad", + "(Tensor, default: Tensor), a temporary " + "output Tensor to store the gradients of warp-ctc, which is " + "computed with loss together in one call. It is a 3-D Tensor of " + "the shape [max_sequence_length, batch_size, num_classes + 1].") + .AsIntermediate(); + AddOutput("Loss", + "(Tensor, default: Tensor), the Connectionist " + "Temporal Classification (CTC) loss, which is a 2-D Tensor of " + "the shape [batch_size, 1]"); + AddAttr("blank", + "(int, default: 0), the blank label of Connectionist " + "Temporal Classification (CTC) loss, which is in the " + "half-open interval [0, num_classes + 1).") + .SetDefault(0); + AddAttr("norm_by_times", + "(bool, default: false), whether to " + "normalize the gradients by the number of time-steps, " + "which is also the sequence's length.") + .SetDefault(false); + AddComment(R"DOC( +An operator integrating the open-source +[warp-ctc](https://github.com/baidu-research/warp-ctc) library, which is used in +[Deep Speech 2: End-to-End Speech Recognition in English and Mandarin]( +https://arxiv.org/pdf/1512.02595v1.pdf), +to compute Connectionist Temporal Classification (CTC) loss. +It can be aliased as softmax with CTC, since a native softmax activation is +integrated into the warp-ctc library to normalize the values for each row of the +input tensor.
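For example, with three input sequences of lengths 2, 3 and 5 and num_classes = 28, Input(Logits) is of shape [10, 29] (one row per time step of every sequence), Output(Loss) is of shape [3, 1], and the intermediate Output(WarpCTCGrad) is of shape [5, 3, 29], i.e. [max_sequence_length, batch_size, num_classes + 1].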
+ +More detail of CTC loss can be found by refering to +[Connectionist Temporal Classification: Labelling Unsegmented Sequence Data with +Recurrent Neural Networks]( +http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf). +)DOC"); + } +}; + +class WarpCTCGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("WarpCTCGrad"), + "Input(WarpCTCGrad) of WarpCTCGradOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Logits")), + "Output(Logits@GRAD) of WarpCTCGradOp should not be null."); + ctx->SetOutputDim(framework::GradVarName("Logits"), + ctx->GetInputDim("Logits")); + ctx->ShareLoD("Logits", /*->*/ framework::GradVarName("Logits")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Logits")->type()), + ctx.device_context()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(warpctc, ops::WarpCTCOp, ops::WarpCTCOpMaker, warpctc_grad, + ops::WarpCTCGradOp); +REGISTER_OP_CPU_KERNEL( + warpctc, ops::WarpCTCKernel); +REGISTER_OP_CPU_KERNEL( + warpctc_grad, + ops::WarpCTCGradKernel); diff --git a/paddle/fluid/operators/warpctc_op.cu.cc b/paddle/fluid/operators/warpctc_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..9ee7f970a9a5f7deaab8a98278cf1a7c051cfbd2 --- /dev/null +++ b/paddle/fluid/operators/warpctc_op.cu.cc @@ -0,0 +1,22 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/warpctc_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + warpctc, ops::WarpCTCKernel); +REGISTER_OP_CUDA_KERNEL( + warpctc_grad, + ops::WarpCTCGradKernel); diff --git a/paddle/fluid/operators/warpctc_op.h b/paddle/fluid/operators/warpctc_op.h new file mode 100644 index 0000000000000000000000000000000000000000..a1de71627ee7e511a03231c11831cf54755a145e --- /dev/null +++ b/paddle/fluid/operators/warpctc_op.h @@ -0,0 +1,229 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/sequence_padding.h" +#include "paddle/fluid/operators/math/sequence_scale.h" +#include "paddle/fluid/platform/dynload/warpctc.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +class WarpCTCFunctor { + public: + /* + * \brief Compute the connectionist temporal classification loss, + * and optionally compute the gradient with respect to the inputs. + * + * If gradient is nullptr, it only computes the ctc loss; + * otherwise it computes both the ctc loss and the gradient. + * + * \param ctx execution context of this functor + * \param input batch matrix of input probabilities, in + * max_sequence_length x num_sequences x + * sequence_width (row-major) format + * \param gradient batch matrix of gradient, with the same shape as + * input. + * \param cpu_labels labels always in CPU memory. + * \param cpu_label_lengths length of all labels in CPU memory. + * \param cpu_input_lengths length of all sequences in CPU memory. + * \param sequence_width number of possible output symbols. + * \param num_sequences number of sequences. + * \param blank blank label used in ctc loss function. + * \param cpu_loss cost of each sequence in CPU memory. + */ + void operator()(const framework::ExecutionContext& ctx, const float* input, + float* gradient, const int* cpu_labels, + const int* cpu_label_lengths, const int* cpu_input_lengths, + const size_t sequence_width, const size_t num_sequences, + const size_t blank, float* cpu_loss) { + // Init warp-ctc options + init(ctx, blank); + + // Compute the required workspace size. + // There are no memory allocation operations within warp-ctc.
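    // get_workspace_size() reports the required size in bytes; the code below
    // rounds it up to a whole number of floats, allocates that many elements in
    // a temporary Tensor, zero-initializes the buffer, and passes it to
    // compute_ctc_loss() together with the CPU-side label and length arrays.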
+ size_t workspace_bytes = 0; + ctcStatus_t status = platform::dynload::get_workspace_size( + cpu_label_lengths, cpu_input_lengths, static_cast(sequence_width), + static_cast(num_sequences), options_, &workspace_bytes); + PADDLE_ENFORCE_EQ(CTC_STATUS_SUCCESS, status, + "warp-ctc [version %d] Error in get_workspace_size: ", + warpctc_version_, + platform::dynload::ctcGetStatusString(status)); + PADDLE_ENFORCE_GT(workspace_bytes, 0UL, + "Bytes of workspace got by warp-ctc function, " + "get_workspace_size(), should be larger than 0."); + + Tensor workspace; + size_t workspace_elements = workspace_bytes / sizeof(float) + 1UL; + float* workspace_data = workspace.mutable_data( + framework::make_ddim({static_cast(workspace_elements)}), + ctx.GetPlace()); + math::SetConstant()( + ctx.template device_context(), &workspace, + static_cast(0)); + + // compute loss and gradient + status = platform::dynload::compute_ctc_loss( + input, gradient, cpu_labels, cpu_label_lengths, cpu_input_lengths, + static_cast(sequence_width), static_cast(num_sequences), + cpu_loss, workspace_data, options_); + PADDLE_ENFORCE_EQ(CTC_STATUS_SUCCESS, status, + "warp-ctc [version %d] Error in compute_ctc_loss: ", + warpctc_version_, + platform::dynload::ctcGetStatusString(status)); + } + + protected: + void init(const framework::ExecutionContext& ctx, const size_t blank) { + warpctc_version_ = platform::dynload::get_warpctc_version(); + + if (platform::is_gpu_place(ctx.GetPlace())) { +#ifdef PADDLE_WITH_CUDA + options_.loc = CTC_GPU; + options_.stream = reinterpret_cast( + ctx.device_context()) + .stream(); +#else + PADDLE_THROW("[warpctc init] GPU is not enabled."); +#endif + } else { + options_.loc = CTC_CPU; + options_.num_threads = 1; + } + + options_.blank_label = blank; + } + + private: + int warpctc_version_; + ctcOptions options_; +}; + +template +class WarpCTCKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* logits = ctx.Input("Logits"); + auto* label = ctx.Input("Label"); + auto* warpctc_grad = ctx.Output("WarpCTCGrad"); + auto* loss = ctx.Output("Loss"); + + const size_t level = 0; + + auto logits_lod = framework::ToAbsOffset(logits->lod()); + auto logits_dims = logits->dims(); + PADDLE_ENFORCE_EQ(logits_dims[0], + static_cast(logits_lod[level].back()), + "The first dimension of Input(Logits) should be equal to " + "the sum of all sequences' lengths."); + + auto label_lod = framework::ToAbsOffset(label->lod()); + auto label_dims = label->dims(); + PADDLE_ENFORCE_EQ( + label_dims[0], label->numel(), + "The width of each timestep in Input(Label) should be 1."); + + const size_t num_sequences = logits_lod[level].size() - 1; + PADDLE_ENFORCE_EQ(num_sequences, label_lod[level].size() - 1, + "The number of sequences of Input(Logits) should be " + "equal to that of Input(Label)."); + + const size_t sequence_width = logits->numel() / logits_dims[0]; + auto loss_dims = + framework::make_ddim({static_cast(num_sequences), 1}); + + // warpctc needs sequences data stored in transposed padding format + Tensor warpctc_logits; + const size_t max_sequence_length = + math::MaximumSequenceLength(logits_lod, level); + auto warpctc_logits_dims = + framework::make_ddim({static_cast(max_sequence_length), + static_cast(num_sequences), + static_cast(sequence_width)}); + warpctc_logits.mutable_data(warpctc_logits_dims, ctx.GetPlace()); + math::PaddingLoDTensorFunctor()( + ctx.template device_context(), *logits, warpctc_logits, + false); + const T* 
warpctc_logits_data = warpctc_logits.data(); + + std::vector warpctc_label_lengths(num_sequences); + std::vector warpctc_logits_lengths(num_sequences); + + for (size_t i = 0; i < num_sequences; ++i) { + warpctc_label_lengths[i] = label_lod[level][i + 1] - label_lod[level][i]; + warpctc_logits_lengths[i] = + logits_lod[level][i + 1] - logits_lod[level][i]; + } + + // warpctc computes loss and gradient in one call, gradient data also stored + // in batch format + T* warpctc_grad_data = + warpctc_grad->mutable_data(warpctc_logits.dims(), ctx.GetPlace()); + + math::SetConstant()( + ctx.template device_context(), warpctc_grad, + static_cast(0)); + + // warpctc accesses labels in CPU memory + Tensor warpctc_label; + Copy(*label, platform::CPUPlace(), ctx.device_context(), &warpctc_label); + const int* warpctc_label_data = warpctc_label.data(); + // warpctc stores loss in CPU memory + Tensor warpctc_loss; + T* warpctc_loss_data = + warpctc_loss.mutable_data(loss_dims, platform::CPUPlace()); + + const size_t blank = static_cast(ctx.Attr("blank")); + + WarpCTCFunctor()( + ctx, warpctc_logits_data, warpctc_grad_data, warpctc_label_data, + warpctc_label_lengths.data(), warpctc_logits_lengths.data(), + sequence_width, num_sequences, blank, warpctc_loss_data); + + // Copy the loss back + Copy(warpctc_loss, ctx.GetPlace(), ctx.device_context(), loss); + } +}; + +template +class WarpCTCGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* warpctc_grad = ctx.Input("WarpCTCGrad"); + auto* logits_grad = ctx.Output(framework::GradVarName("Logits")); + const Tensor* loss_grad = ctx.Input(framework::GradVarName("Loss")); + + logits_grad->mutable_data(ctx.GetPlace()); + bool norm_by_times = ctx.Attr("norm_by_times"); + math::UnpaddingLoDTensorFunctor()( + ctx.template device_context(), *logits_grad, + *warpctc_grad, norm_by_times); + + const T* loss_grad_data = loss_grad->data(); + math::ScaleLoDTensorFunctor()( + ctx.template device_context(), *logits_grad, + loss_grad_data); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/while_op.cc b/paddle/fluid/operators/while_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d254c572acff52d967e551c377b3b32b05c92973 --- /dev/null +++ b/paddle/fluid/operators/while_op.cc @@ -0,0 +1,352 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/detail/safe_ref.h" + +namespace paddle { +namespace operators { + +using StepScopeVar = std::vector; +using LoDTensor = framework::LoDTensor; + +static constexpr char kStepBlock[] = "sub_block"; +static constexpr char kCondition[] = "Condition"; +static constexpr char kStepScopes[] = "StepScopes"; +static constexpr char kX[] = "X"; +static constexpr char kXGRAD[] = "X@GRAD"; +static constexpr char kOutputs[] = "Out"; + +class WhileOp : public framework::OperatorBase { + public: + WhileOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kCondition))); + auto &cond = scope.FindVar(Input(kCondition))->Get(); + PADDLE_ENFORCE_EQ(cond.dims(), paddle::framework::make_ddim({1})); + + framework::Executor executor(dev_place); + auto *block = Attr(kStepBlock); + + auto *program = block->Program(); + + auto step_scopes = + scope.FindVar(Output(kStepScopes))->GetMutable(); + + PADDLE_ENFORCE(platform::is_cpu_place(cond.place()), + "Condition of while op must in CPU memory."); + while (cond.data()[0]) { + auto ¤t_scope = scope.NewScope(); + step_scopes->push_back(¤t_scope); + + executor.Run(*program, ¤t_scope, block->ID(), + false /*create_local_scope*/); + } + } +}; + +class WhileOpMaker : public framework::OpProtoAndCheckerMaker { + public: + WhileOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput(kX, + "A set of variables, which are required by operators inside the " + "block of While Op.") + .AsDuplicable(); + AddInput( + kCondition, + "(Bool) An scalar. When it's False, the While Op will be terminated.") + .AsDuplicable(); + AddOutput(kOutputs, + "A set of variables, which will be assigned with values " + "generated by the operators inside the block of While Op.") + .AsDuplicable(); + AddOutput(kStepScopes, + "(StepScopeVar) A vector of local scope, which size equals the " + "step number of While Op. 
The i'th scope storages temporary " + "variables generated in the i'th step."); + AddAttr(kStepBlock, + "The step block inside WhileOp"); + AddComment(R"DOC( +)DOC"); + } +}; + +class WhileGradOp : public framework::OperatorBase { + public: + WhileGradOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::Place &dev_place) const override { + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); + framework::Executor executor(dev_place); + auto *block = Attr(kStepBlock); + auto *program = block->Program(); + + auto *step_scopes = + scope.FindVar(Input(kStepScopes))->GetMutable(); + + auto outside_og_names = Inputs(framework::GradVarName(kOutputs)); + auto inside_og_names = + Attr>("original_output_grad"); + + PADDLE_ENFORCE_EQ(outside_og_names.size(), inside_og_names.size()); + + for (auto cur_scope_iter = step_scopes->rbegin(); + cur_scope_iter != step_scopes->rend(); ++cur_scope_iter) { + VLOG(3) << "Start backward at time_step " + << cur_scope_iter - step_scopes->rbegin(); + framework::Scope &cur_scope = **cur_scope_iter; + // Link OG from outside to inside + for (size_t i = 0; i < outside_og_names.size(); ++i) { + auto outside_og_name = outside_og_names[i]; + auto inside_og_name = inside_og_names[i]; + VLOG(8) << "Linking outside " << outside_og_name << " --> inside " + << inside_og_name; + auto &og_outside = + detail::Ref(scope.FindVar(outside_og_name), + "Cannot find Outside Gradient %s", outside_og_name); + auto &og_inside = + detail::Ref(cur_scope.Var(inside_og_name), + "Cannot find inside gradient %s", inside_og_name); + if (og_outside.Type().hash_code() == + typeid(framework::LoDTensor).hash_code()) { + auto &outside_tensor = og_outside.Get(); + auto &inside_tensor = + detail::Ref(og_inside.GetMutable()); + inside_tensor.set_lod(outside_tensor.lod()); + inside_tensor.ShareDataWith(outside_tensor); + } else if (og_outside.Type().hash_code() == + typeid(framework::LoDTensorArray).hash_code()) { + auto &outside_array = og_outside.Get(); + auto &inside_array = + detail::Ref(og_inside.GetMutable()); + VLOG(8) << outside_og_name << " size = " << outside_array.size(); + inside_array.resize(outside_array.size()); + + for (size_t j = 0; j < inside_array.size(); ++j) { + VLOG(8) << j << " " << outside_array[j].numel(); + if (outside_array[j].numel() != 0) { + inside_array[j].set_lod(outside_array[j].lod()); + inside_array[j].ShareDataWith(outside_array[j]); + } else { + PADDLE_ENFORCE_EQ(inside_array[j].numel(), 0); + } + } + } + } + + executor.Run(*program, *cur_scope_iter, block->ID(), false); + + auto &pg_names = Outputs(kXGRAD); + auto &p_names = Inputs(kX); + PADDLE_ENFORCE_EQ(pg_names.size(), p_names.size()); + for (size_t param_id = 0; param_id < pg_names.size(); ++param_id) { + if (pg_names[param_id] == framework::kEmptyVarName) { + continue; // parameter doesn't have gradient + } + auto inside_grad_name = framework::GradVarName(p_names[param_id]); + + // // TODO(tonyyang-svail): Not sure we need the following + // // If does not compute gradient of that variable inside rnn, + // just + // // continue + // if (local_var_names.find(inside_grad_name) == + // local_var_names.end()) { + // continue; + // } + + // zero gradient variable in step 0 + if (cur_scope_iter == 
step_scopes->rbegin()) { + auto *var = (*cur_scope_iter)->FindVar(inside_grad_name); + PADDLE_ENFORCE_NOT_NULL(var, "Can not find var %s", inside_grad_name); + if (var->IsType()) { + auto &inside_tensor = var->Get(); + framework::AttributeMap attrs; + attrs["dtype"] = framework::ToDataType(inside_tensor.type()); + attrs["shape"] = framework::vectorize2int(inside_tensor.dims()); + attrs["value"] = 0.0f; + + auto var_name = pg_names[param_id]; + auto zero_op = framework::OpRegistry::CreateOp( + "fill_constant", framework::VariableNameMap{}, + {{"Out", {var_name}}}, attrs); + zero_op->Run(scope, dev_place); + scope.FindVar(var_name) + ->GetMutable() + ->set_lod(inside_tensor.lod()); + } + } + + auto new_inside_name = cur_scope.Rename(inside_grad_name); + auto sum_op = framework::OpRegistry::CreateOp( + "sum", {{"X", {pg_names[param_id], new_inside_name}}}, + {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{}); + sum_op->Run(cur_scope, dev_place); + cur_scope.Rename(new_inside_name, inside_grad_name); + } + dev_ctx.Wait(); + const_cast(scope).DeleteScope(&cur_scope); + } + } +}; + +class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *while_grad = new framework::OpDesc(); + while_grad->SetType("while_grad"); + while_grad->SetInput(kX, Input(kX)); + while_grad->SetInput(kOutputs, Output(kOutputs)); + while_grad->SetInput(kStepScopes, Output(kStepScopes)); + + auto *grad_block = this->grad_block_[0]; + auto *fwd_block = grad_block->ParentBlock(); + + // Not all of IGs will be generated by inner gradient operators of while op. + // Ignore IGs that is not generated by the inside block. + std::unordered_set inner_op_outputs; + for (const auto *op : grad_block->AllOps()) { + for (auto &oname : op->OutputArgumentNames()) { + inner_op_outputs.insert(oname); + } + } + auto igs = InputGrad(kX, /*do not drop empty gradient*/ false); + for (auto &each_ig : igs) { + if (inner_op_outputs.find(each_ig) == inner_op_outputs.end()) { + VLOG(8) << "Ignore " << each_ig; + each_ig = framework::kEmptyVarName; + } + } + while_grad->SetOutput(framework::GradVarName(kX), igs); + + // OG should be re-calculated by step blocks, since many outputs of while op + // do not need to calculate gradients. + std::unordered_set block_ins; + block_ins.reserve(Input(kX).size() + Output(kOutputs).size()); + for (auto &p : Input(kX)) { + block_ins.insert(p); + } + for (auto &o : Output(kOutputs)) { + block_ins.insert(o); + } + std::unordered_set extra_inputs; + for (const auto *op : grad_block->AllOps()) { + for (auto &input_name : op->InputArgumentNames()) { + // If the input of Op has been recorded or is generated by the forward + // block, do not make it as input again. + if (block_ins.find(input_name) != block_ins.end() || + fwd_block->FindVar(input_name) != nullptr) { + continue; + } + extra_inputs.insert(input_name); + } + for (auto &output_name : op->OutputArgumentNames()) { + block_ins.insert(output_name); + } + } + + std::vector extra_inputs_list; + extra_inputs_list.resize(extra_inputs.size()); + std::copy(extra_inputs.begin(), extra_inputs.end(), + extra_inputs_list.begin()); + while_grad->SetInput(framework::GradVarName(kOutputs), extra_inputs_list); + + while_grad->SetAttrMap(this->Attrs()); + while_grad->SetBlockAttr(kStepBlock, *grad_block); + // record the original output gradient names, since the gradient name of + // while operator could be renamed. 
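    // The same name list is used both as Input(Out@GRAD) of while_grad and as
    // this attribute, so that WhileGradOp::Run() can pair each (possibly
    // renamed) outside gradient variable with the name the step block still
    // knows it by.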
+ while_grad->SetAttr("original_output_grad", extra_inputs_list); + + return std::unique_ptr(while_grad); + } +}; + +class WhileGradOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + auto p_names = op_desc.Input(kX); + auto pg_names = op_desc.Output(framework::GradVarName(kX)); + + for (size_t i = 0; i < p_names.size(); ++i) { + auto &p_var = detail::Ref(block->FindVarRecursive(p_names[i])); + auto *g_var = block->FindVarRecursive(pg_names[i]); + if (g_var != nullptr) { // Gradient could be @EMPTY@ + VLOG(5) << "Setting " << pg_names[i] << " following " << p_names[i] + << " type: " << p_var.GetType(); + g_var->SetType(p_var.GetType()); + g_var->SetDataType(p_var.GetDataType()); + } + } + } +}; + +class WhileGradOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + ctx->HasInputs(kX); + ctx->HasOutputs(framework::GradVarName(kX)); + ctx->HasInputs(kOutputs); + ctx->HasInputs(framework::GradVarName(kOutputs)); + + auto p_names = ctx->Inputs(kX); + auto pg_names = ctx->Outputs(kXGRAD); + auto var_types = ctx->GetInputsVarType(kX); + std::vector names_to_set; + std::vector dims_to_set; + for (size_t i = 0; i < p_names.size(); ++i) { + if (pg_names[i] == framework::kEmptyVarName) { + continue; + } + auto dims = ctx->GetInputsElementDim(kX, i); + if (var_types[i] == framework::proto::VarDesc::LOD_TENSOR) { + names_to_set.push_back(pg_names[i]); + dims_to_set.push_back(dims); + } else if (var_types[i] == framework::proto::VarDesc::LOD_TENSOR_ARRAY) { + // not sure how to set the dim of LOD_TENSOR_ARRAY + names_to_set.push_back(pg_names[i]); + dims_to_set.push_back(dims); + } + } + ctx->SetDims(names_to_set, dims_to_set); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(while, paddle::operators::WhileOp, + paddle::operators::WhileOpMaker, + paddle::operators::WhileGradOpDescMaker); +REGISTER_OPERATOR(while_grad, paddle::operators::WhileGradOp, + paddle::operators::WhileGradOpShapeInference, + paddle::operators::WhileGradOpVarTypeInference); diff --git a/paddle/fluid/platform/.clang-format b/paddle/fluid/platform/.clang-format new file mode 100644 index 0000000000000000000000000000000000000000..29282dc87e2c499988c17d90d47d44cd5cf7f115 --- /dev/null +++ b/paddle/fluid/platform/.clang-format @@ -0,0 +1,5 @@ +--- +Language: Cpp +BasedOnStyle: Google +Standard: Cpp11 +... diff --git a/paddle/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt similarity index 100% rename from paddle/platform/CMakeLists.txt rename to paddle/fluid/platform/CMakeLists.txt diff --git a/paddle/platform/assert.h b/paddle/fluid/platform/assert.h similarity index 100% rename from paddle/platform/assert.h rename to paddle/fluid/platform/assert.h diff --git a/paddle/platform/call_once.h b/paddle/fluid/platform/call_once.h similarity index 100% rename from paddle/platform/call_once.h rename to paddle/fluid/platform/call_once.h diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc new file mode 100644 index 0000000000000000000000000000000000000000..47473aead0e512005a63e60b01170b41500dd1f6 --- /dev/null +++ b/paddle/fluid/platform/cpu_info.cc @@ -0,0 +1,66 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/cpu_info.h" + +#ifdef __APPLE__ +#include +#include +#else +#include +#endif + +#include "gflags/gflags.h" + +DEFINE_double(fraction_of_cpu_memory_to_use, 1, + "Default use 100% of CPU memory for PaddlePaddle," + "reserve the rest for page tables, etc"); + +namespace paddle { +namespace platform { + +inline size_t CpuTotalPhysicalMemory() { +#ifdef __APPLE__ + int mib[2]; + mib[0] = CTL_HW; + mib[1] = HW_MEMSIZE; + int64_t size = 0; + size_t len = sizeof(size); + if (sysctl(mib, 2, &size, &len, NULL, 0) == 0) return (size_t)size; + return 0L; +#else + int64_t pages = sysconf(_SC_PHYS_PAGES); + int64_t page_size = sysconf(_SC_PAGE_SIZE); + return pages * page_size; +#endif +} + +size_t CpuMaxAllocSize() { + // For distributed systems, it requires configuring and limiting + // the fraction of memory to use. + return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory(); +} + +size_t CpuMinChunkSize() { + // Allow to allocate the minimum chunk size is 4 KB. + return 1 << 12; +} + +size_t CpuMaxChunkSize() { + // Allow to allocate the maximum chunk size is roughly 3% of CPU memory. + return CpuMaxAllocSize() / 32; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h similarity index 100% rename from paddle/platform/cpu_info.h rename to paddle/fluid/platform/cpu_info.h diff --git a/paddle/fluid/platform/cpu_info_test.cc b/paddle/fluid/platform/cpu_info_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..d1fdba13b80629902faf2bcebb646572b36b459e --- /dev/null +++ b/paddle/fluid/platform/cpu_info_test.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
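// Illustrative arithmetic for the sizes defined in cpu_info.cc above, assuming
// 16 GiB of physical memory and the default fraction_of_cpu_memory_to_use of 1.0:
//   CpuMaxAllocSize() = 1.0 * 16 GiB           = 16 GiB
//   CpuMinChunkSize() = 1 << 12 bytes          = 4 KiB
//   CpuMaxChunkSize() = CpuMaxAllocSize() / 32 = 512 MiB (roughly 3% of memory)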
+#include "paddle/fluid/platform/cpu_info.h" +#include "paddle/string/printf.h" + +#include +#include + +#include "gflags/gflags.h" +#include "glog/logging.h" +#include "gtest/gtest.h" + +DECLARE_double(fraction_of_cpu_memory_to_use); + +TEST(CpuMemoryUsage, Print) { + std::stringstream ss; + size_t memory_size = paddle::platform::CpuMaxAllocSize() / 1024 / 1024 / 1024; + float use_percent = FLAGS_fraction_of_cpu_memory_to_use * 100; + + std::cout << paddle::string::Sprintf("\n%.2f %% of CPU Memory Usage: %d GB\n", + use_percent, memory_size) + << std::endl; +} diff --git a/paddle/platform/cuda_helper.h b/paddle/fluid/platform/cuda_helper.h similarity index 100% rename from paddle/platform/cuda_helper.h rename to paddle/fluid/platform/cuda_helper.h diff --git a/paddle/platform/cuda_profiler.h b/paddle/fluid/platform/cuda_profiler.h similarity index 100% rename from paddle/platform/cuda_profiler.h rename to paddle/fluid/platform/cuda_profiler.h diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..f2daa4f4fcc6fe43c2950b413024df7e301abf50 --- /dev/null +++ b/paddle/fluid/platform/cudnn_helper.h @@ -0,0 +1,286 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include "paddle/fluid/platform/dynload/cudnn.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace platform { + +inline const char* cudnnGetErrorString(cudnnStatus_t status) { + switch (status) { + case CUDNN_STATUS_SUCCESS: + return "CUDNN_STATUS_SUCCESS"; + case CUDNN_STATUS_NOT_INITIALIZED: + return "CUDNN_STATUS_NOT_INITIALIZED"; + case CUDNN_STATUS_ALLOC_FAILED: + return "CUDNN_STATUS_ALLOC_FAILED"; + case CUDNN_STATUS_BAD_PARAM: + return "CUDNN_STATUS_BAD_PARAM"; + case CUDNN_STATUS_INTERNAL_ERROR: + return "CUDNN_STATUS_INTERNAL_ERROR"; + case CUDNN_STATUS_INVALID_VALUE: + return "CUDNN_STATUS_INVALID_VALUE"; + case CUDNN_STATUS_ARCH_MISMATCH: + return "CUDNN_STATUS_ARCH_MISMATCH"; + case CUDNN_STATUS_MAPPING_ERROR: + return "CUDNN_STATUS_MAPPING_ERROR"; + case CUDNN_STATUS_EXECUTION_FAILED: + return "CUDNN_STATUS_EXECUTION_FAILED"; + case CUDNN_STATUS_NOT_SUPPORTED: + return "CUDNN_STATUS_NOT_SUPPORTED"; + case CUDNN_STATUS_LICENSE_ERROR: + return "CUDNN_STATUS_LICENSE_ERROR"; + default: + return "Unknown cudnn error number"; + } +} + +#define CUDNN_VERSION_MIN(major, minor, patch) \ + (CUDNN_VERSION >= ((major)*1000 + (minor)*100 + (patch))) + +#define CUDNN_ENFORCE(condition) \ + do { \ + cudnnStatus_t status = condition; \ + if (status != CUDNN_STATUS_SUCCESS) { \ + VLOG(1) << ::paddle::platform::cudnnGetErrorString(status); \ + PADDLE_THROW("cuDNN call failed"); \ + } \ + } while (false) + +enum class DataLayout { // Not use + kNHWC, + kNCHW, + kNCDHW, + kNCHW_VECT_C, +}; + +enum class PoolingMode { + kMaximum, + kAverage, +}; + +template +class CudnnDataType; + +template <> +class CudnnDataType { + public: + static const cudnnDataType_t type = CUDNN_DATA_FLOAT; + typedef const float ScalingParamType; + static ScalingParamType* kOne() { + static ScalingParamType v = 1.0; + return &v; + } + static ScalingParamType* kZero() { + static ScalingParamType v = 0.0; + return &v; + } +}; + +template <> +class CudnnDataType { + public: + static const cudnnDataType_t type = CUDNN_DATA_DOUBLE; + typedef const double ScalingParamType; + static ScalingParamType* kOne() { + static ScalingParamType v = 1.0; + return &v; + } + static ScalingParamType* kZero() { + static ScalingParamType v = 0.0; + return &v; + } +}; + +inline cudnnTensorFormat_t GetCudnnTensorFormat( + const DataLayout& order) { // Not use + switch (order) { + case DataLayout::kNHWC: + return CUDNN_TENSOR_NHWC; + case DataLayout::kNCHW: + return CUDNN_TENSOR_NCHW; + case DataLayout::kNCDHW: + return CUDNN_TENSOR_NCHW; // NOTE: cudnn treat NdTensor as the same + default: + PADDLE_THROW("Unknown cudnn equivalent for order"); + } + return CUDNN_TENSOR_NCHW; +} + +class ScopedTensorDescriptor { + public: + ScopedTensorDescriptor() { + PADDLE_ENFORCE(dynload::cudnnCreateTensorDescriptor(&desc_)); + } + ~ScopedTensorDescriptor() { + PADDLE_ENFORCE(dynload::cudnnDestroyTensorDescriptor(desc_)); + } + + inline cudnnTensorDescriptor_t descriptor(const cudnnTensorFormat_t format, + const cudnnDataType_t type, + const std::vector& dims, + const int groups = 1) { + // the format is not used now, will add later + std::vector strides(dims.size()); + strides[dims.size() - 1] = 1; + for (int i = dims.size() - 2; i >= 0; i--) { + strides[i] = dims[i + 1] * strides[i + 1]; + } + // Update tensor descriptor dims setting if groups > 1 + // NOTE: Assume using NCHW or NCDHW order + std::vector dims_with_group(dims.begin(), dims.end()); // copy + if 
(groups > 1) { + dims_with_group[1] = dims_with_group[1] / groups; + } + PADDLE_ENFORCE(dynload::cudnnSetTensorNdDescriptor( + desc_, type, dims_with_group.size(), dims_with_group.data(), + strides.data())); + return desc_; + } + + template + inline cudnnTensorDescriptor_t descriptor(const DataLayout& order, + const std::vector& dims, + const int groups = 1) { + return descriptor(GetCudnnTensorFormat(order), CudnnDataType::type, dims, + groups); + } + + private: + cudnnTensorDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedTensorDescriptor); +}; + +class ScopedFilterDescriptor { + public: + ScopedFilterDescriptor() { + PADDLE_ENFORCE(dynload::cudnnCreateFilterDescriptor(&desc_)); + } + ~ScopedFilterDescriptor() { + PADDLE_ENFORCE(dynload::cudnnDestroyFilterDescriptor(desc_)); + } + + inline cudnnFilterDescriptor_t descriptor(const cudnnTensorFormat_t format, + const cudnnDataType_t type, + const std::vector& kernel, + const int groups = 1) { + // filter layout: MCHW(MCDHW), where M is the number of + // output image channels, C is the number of input image channels, + // D is the depth of the filter, H is the height of the filter, and W is the + // width of the filter. + std::vector kernel_with_group(kernel.begin(), kernel.end()); + if (groups > 1) { + kernel_with_group[0] /= groups; + // NOTE: input filter(C) of the filter is already asserted to be C/groups. + } + PADDLE_ENFORCE(dynload::cudnnSetFilterNdDescriptor( + desc_, type, format, kernel_with_group.size(), + kernel_with_group.data())); + return desc_; + } + + template + inline cudnnFilterDescriptor_t descriptor(const DataLayout& order, + const std::vector& kernel, + const int groups = 1) { + return descriptor(GetCudnnTensorFormat(order), CudnnDataType::type, + kernel, groups); + } + + private: + cudnnFilterDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedFilterDescriptor); +}; + +class ScopedConvolutionDescriptor { + public: + ScopedConvolutionDescriptor() { + PADDLE_ENFORCE(dynload::cudnnCreateConvolutionDescriptor(&desc_)); + } + ~ScopedConvolutionDescriptor() { + PADDLE_ENFORCE(dynload::cudnnDestroyConvolutionDescriptor(desc_)); + } + + inline cudnnConvolutionDescriptor_t descriptor( + cudnnDataType_t type, const std::vector& pads, + const std::vector& strides, const std::vector& dilations) { + PADDLE_ENFORCE_EQ(pads.size(), strides.size()); + PADDLE_ENFORCE_EQ(pads.size(), dilations.size()); + +#if !CUDNN_VERSION_MIN(6, 0, 0) + // cudnn v5 does not support dilation conv, the argument is called upscale + // instead of dilations and it is must be one. 
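    // The loop below therefore rejects any dilation != 1 when building against
    // cuDNN releases older than 6.0; with cuDNN >= 6.0 the dilation values are
    // passed through to cudnnSetConvolutionNdDescriptor unchanged.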
+ for (size_t i = 0; i < dilations.size(); ++i) { + PADDLE_ENFORCE_EQ( + dilations[i], 1, + "Dilations conv is not supported in this cuDNN version(%d.%d.%d).", + CUDNN_VERSION / 1000, CUDNN_VERSION % 1000 / 100, + CUDNN_VERSION % 100); + } +#endif + + PADDLE_ENFORCE(dynload::cudnnSetConvolutionNdDescriptor( + desc_, pads.size(), pads.data(), strides.data(), dilations.data(), + CUDNN_CROSS_CORRELATION, type)); + return desc_; + } + + template + inline cudnnConvolutionDescriptor_t descriptor( + const std::vector& pads, const std::vector& strides, + const std::vector& dilations) { + return descriptor(CudnnDataType::type, pads, strides, dilations); + } + + private: + cudnnConvolutionDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedConvolutionDescriptor); +}; + +class ScopedPoolingDescriptor { + public: + ScopedPoolingDescriptor() { + PADDLE_ENFORCE(dynload::cudnnCreatePoolingDescriptor(&desc_)); + } + ~ScopedPoolingDescriptor() { + PADDLE_ENFORCE(dynload::cudnnDestroyPoolingDescriptor(desc_)); + } + + inline cudnnPoolingDescriptor_t descriptor(const PoolingMode& mode, + const std::vector& kernel, + const std::vector& pads, + const std::vector& strides) { + PADDLE_ENFORCE_EQ(kernel.size(), pads.size()); + PADDLE_ENFORCE_EQ(kernel.size(), strides.size()); + PADDLE_ENFORCE(dynload::cudnnSetPoolingNdDescriptor( + desc_, (mode == PoolingMode::kMaximum + ? CUDNN_POOLING_MAX + : CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING), + CUDNN_PROPAGATE_NAN, // Always propagate nans. + kernel.size(), kernel.data(), pads.data(), strides.data())); + return desc_; + } + + private: + cudnnPoolingDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedPoolingDescriptor); +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/cudnn_helper_test.cc b/paddle/fluid/platform/cudnn_helper_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..cd0bd3fe3ed115c4a91723e1023851456da74890 --- /dev/null +++ b/paddle/fluid/platform/cudnn_helper_test.cc @@ -0,0 +1,154 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/cudnn_helper.h" +#include + +TEST(CudnnHelper, ScopedTensorDescriptor) { + using paddle::platform::ScopedTensorDescriptor; + using paddle::platform::DataLayout; + + ScopedTensorDescriptor tensor_desc; + std::vector shape = {2, 4, 6, 6}; + auto desc = tensor_desc.descriptor(DataLayout::kNCHW, shape); + + cudnnDataType_t type; + int nd; + std::vector dims(4); + std::vector strides(4); + paddle::platform::dynload::cudnnGetTensorNdDescriptor( + desc, 4, &type, &nd, dims.data(), strides.data()); + + EXPECT_EQ(nd, 4); + for (size_t i = 0; i < dims.size(); ++i) { + EXPECT_EQ(dims[i], shape[i]); + } + EXPECT_EQ(strides[3], 1); + EXPECT_EQ(strides[2], 6); + EXPECT_EQ(strides[1], 36); + EXPECT_EQ(strides[0], 144); + + // test tensor5d: ScopedTensorDescriptor + ScopedTensorDescriptor tensor5d_desc; + std::vector shape_5d = {2, 4, 6, 6, 6}; + auto desc_5d = tensor5d_desc.descriptor(DataLayout::kNCDHW, shape_5d); + + std::vector dims_5d(5); + std::vector strides_5d(5); + paddle::platform::dynload::cudnnGetTensorNdDescriptor( + desc_5d, 5, &type, &nd, dims_5d.data(), strides_5d.data()); + + EXPECT_EQ(nd, 5); + for (size_t i = 0; i < dims_5d.size(); ++i) { + EXPECT_EQ(dims_5d[i], shape_5d[i]); + } + EXPECT_EQ(strides_5d[4], 1); + EXPECT_EQ(strides_5d[3], 6); + EXPECT_EQ(strides_5d[2], 36); + EXPECT_EQ(strides_5d[1], 216); + EXPECT_EQ(strides_5d[0], 864); +} + +TEST(CudnnHelper, ScopedFilterDescriptor) { + using paddle::platform::ScopedFilterDescriptor; + using paddle::platform::DataLayout; + + ScopedFilterDescriptor filter_desc; + std::vector shape = {2, 3, 3}; + auto desc = filter_desc.descriptor(DataLayout::kNCHW, shape); + + cudnnDataType_t type; + int nd; + cudnnTensorFormat_t format; + std::vector kernel(3); + paddle::platform::dynload::cudnnGetFilterNdDescriptor(desc, 3, &type, &format, + &nd, kernel.data()); + + EXPECT_EQ(GetCudnnTensorFormat(DataLayout::kNCHW), format); + EXPECT_EQ(nd, 3); + for (size_t i = 0; i < shape.size(); ++i) { + EXPECT_EQ(kernel[i], shape[i]); + } + + ScopedFilterDescriptor filter_desc_4d; + std::vector shape_4d = {2, 3, 3, 3}; + auto desc_4d = filter_desc.descriptor(DataLayout::kNCDHW, shape_4d); + + std::vector kernel_4d(4); + paddle::platform::dynload::cudnnGetFilterNdDescriptor( + desc_4d, 4, &type, &format, &nd, kernel_4d.data()); + + EXPECT_EQ(GetCudnnTensorFormat(DataLayout::kNCHW), format); + EXPECT_EQ(nd, 4); + for (size_t i = 0; i < shape_4d.size(); ++i) { + EXPECT_EQ(kernel_4d[i], shape_4d[i]); + } +} + +TEST(CudnnHelper, ScopedConvolutionDescriptor) { + using paddle::platform::ScopedConvolutionDescriptor; + + ScopedConvolutionDescriptor conv_desc; + std::vector src_pads = {2, 2, 2}; + std::vector src_strides = {1, 1, 1}; + std::vector src_dilations = {1, 1, 1}; + auto desc = conv_desc.descriptor(src_pads, src_strides, src_dilations); + + cudnnDataType_t type; + cudnnConvolutionMode_t mode; + int nd; + std::vector pads(3); + std::vector strides(3); + std::vector dilations(3); + paddle::platform::dynload::cudnnGetConvolutionNdDescriptor( + desc, 3, &nd, pads.data(), strides.data(), dilations.data(), &mode, + &type); + + EXPECT_EQ(nd, 3); + for (size_t i = 0; i < src_pads.size(); ++i) { + EXPECT_EQ(pads[i], src_pads[i]); + EXPECT_EQ(strides[i], src_strides[i]); + EXPECT_EQ(dilations[i], src_dilations[i]); + } + EXPECT_EQ(mode, CUDNN_CROSS_CORRELATION); +} + +TEST(CudnnHelper, ScopedPoolingDescriptor) { + using paddle::platform::ScopedPoolingDescriptor; + using paddle::platform::PoolingMode; + + ScopedPoolingDescriptor 
pool_desc; + std::vector src_kernel = {2, 2, 5}; + std::vector src_pads = {1, 1, 2}; + std::vector src_strides = {2, 2, 3}; + auto desc = pool_desc.descriptor(PoolingMode::kMaximum, src_kernel, src_pads, + src_strides); + + cudnnPoolingMode_t mode; + cudnnNanPropagation_t nan_t = CUDNN_PROPAGATE_NAN; + int nd; + std::vector kernel(3); + std::vector pads(3); + std::vector strides(3); + paddle::platform::dynload::cudnnGetPoolingNdDescriptor( + desc, 3, &mode, &nan_t, &nd, kernel.data(), pads.data(), strides.data()); + + EXPECT_EQ(nd, 3); + for (size_t i = 0; i < src_pads.size(); ++i) { + EXPECT_EQ(kernel[i], src_kernel[i]); + EXPECT_EQ(pads[i], src_pads[i]); + EXPECT_EQ(strides[i], src_strides[i]); + } + EXPECT_EQ(mode, CUDNN_POOLING_MAX); +} diff --git a/paddle/platform/details/device_ptr_cast.h b/paddle/fluid/platform/details/device_ptr_cast.h similarity index 100% rename from paddle/platform/details/device_ptr_cast.h rename to paddle/fluid/platform/details/device_ptr_cast.h diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc new file mode 100644 index 0000000000000000000000000000000000000000..c4da846bb1c25abc3d31006657652abaa5a11add --- /dev/null +++ b/paddle/fluid/platform/device_context.cc @@ -0,0 +1,236 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/memory/memory.h" + +namespace paddle { +namespace platform { + +DeviceContextPool* DeviceContextPool::pool = nullptr; + +const platform::DeviceContext* DeviceContextPool::Get( + const platform::Place& place) { + auto it = device_contexts_.find(place); + if (it == device_contexts_.end()) { + PADDLE_THROW( + "'Place' is not supported, Please re-compile with WITH_GPU " + "option"); + } + return it->second; +} + +DeviceContextPool::DeviceContextPool( + const std::vector& places) { + PADDLE_ENFORCE_GT(places.size(), 0); + for (size_t i = 0; i < places.size(); i++) { + if (platform::is_cpu_place(places[i])) { + device_contexts_.emplace(places[i], + new platform::CPUDeviceContext( + boost::get(places[i]))); + } else if (platform::is_gpu_place(places[i])) { +#ifdef PADDLE_WITH_CUDA + device_contexts_.emplace(places[i], + new platform::CUDADeviceContext( + boost::get(places[i]))); +#else + PADDLE_THROW( + "'CUDAPlace' is not supported, Please re-compile with WITH_GPU " + "option"); +#endif + } + } +} + +CPUDeviceContext::CPUDeviceContext() { + eigen_device_.reset(new Eigen::DefaultDevice()); +} + +CPUDeviceContext::CPUDeviceContext(CPUPlace place) : place_(place) { + eigen_device_.reset(new Eigen::DefaultDevice()); +} + +Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const { + return eigen_device_.get(); +} + +Place CPUDeviceContext::GetPlace() const { return place_; } + +#ifdef PADDLE_WITH_CUDA + +class EigenCudaStreamDevice : public Eigen::StreamInterface { + public: + EigenCudaStreamDevice() : scratch_(nullptr), semaphore_(nullptr) { + Eigen::initializeDeviceProp(); + } + ~EigenCudaStreamDevice() override {} + + void Reinitialize(const cudaStream_t* cuda_stream, CUDAPlace place) { + stream_ = cuda_stream; + place_ = place; + device_prop_ = &Eigen::m_deviceProperties[place.device]; + } + + const cudaStream_t& stream() const override { return *stream_; } + + const cudaDeviceProp& deviceProperties() const override { + return *device_prop_; + } + + void* allocate(size_t num_bytes) const override { + return paddle::memory::Alloc(place_, num_bytes); + } + + void deallocate(void* buffer) const override { + paddle::memory::Free(place_, buffer); + } + + void* scratchpad() const override { + if (scratch_ == NULL) { + scratch_ = allocate(Eigen::kCudaScratchSize + sizeof(unsigned int)); + } + return scratch_; + } + + unsigned int* semaphore() const override { + if (semaphore_ == NULL) { + char* scratch = + static_cast(scratchpad()) + Eigen::kCudaScratchSize; + semaphore_ = reinterpret_cast(scratch); + PADDLE_ENFORCE( + cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_)); + } + return semaphore_; + } + + private: + CUDAPlace place_; + const cudaStream_t* stream_; // not owned; + const cudaDeviceProp* device_prop_; // not owned; + mutable void* scratch_; + mutable unsigned int* semaphore_; +}; + +CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) { + SetDeviceId(place_.device); + PADDLE_ENFORCE(cudaStreamCreate(&stream_)); + eigen_stream_.reset(new EigenCudaStreamDevice()); + eigen_stream_->Reinitialize(&stream_, place); + eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get())); + PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_)); + PADDLE_ENFORCE(dynload::cublasSetStream(cublas_handle_, stream_)); + if (dynload::HasCUDNN()) { + PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_)); + PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, stream_)); + } else { + cudnn_handle_ = 
nullptr; + } +} + +CUDADeviceContext::~CUDADeviceContext() { + SetDeviceId(place_.device); + Wait(); + PADDLE_ENFORCE(dynload::cublasDestroy(cublas_handle_)); + if (cudnn_handle_ != nullptr) { + PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_)); + } + eigen_stream_.reset(); + eigen_device_.reset(); + PADDLE_ENFORCE(cudaStreamDestroy(stream_)); +} + +Place CUDADeviceContext::GetPlace() const { return place_; } + +void CUDADeviceContext::Wait() const { + PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); + PADDLE_ENFORCE(cudaGetLastError()); +} + +Eigen::GpuDevice* CUDADeviceContext::eigen_device() const { + return eigen_device_.get(); +} + +cublasHandle_t CUDADeviceContext::cublas_handle() const { + return cublas_handle_; +} + +cudnnHandle_t CUDADeviceContext::cudnn_handle() const { return cudnn_handle_; } + +cudaStream_t CUDADeviceContext::stream() const { return stream_; } + +#endif + +#ifdef PADDLE_WITH_MKLDNN +MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place) + : CPUDeviceContext(place), ready_(false) { + stream_.reset(new mkldnn::stream(mkldnn::stream::kind::eager)); + engine_.reset(new mkldnn::engine(mkldnn::engine::cpu, 0)); +} + +template +void MKLDNNDeviceContext::AddElement(const std::string& op_key, + const T& value) { + if (GetElement(op_key)) { + return; + } + GetElementPool().emplace(op_key, std::move(value)); +} + +template +const T& MKLDNNDeviceContext::GetElement(const std::string& op_key) const { + auto it = GetElementPool().find(op_key); + return it == GetElementPool().end() ? nullptr : it->second; +} + +template <> +const std::unordered_map>& +MKLDNNDeviceContext::GetElementPool() const { + return memory_pool_; +} + +template <> +const std::unordered_map>& +MKLDNNDeviceContext::GetElementPool() const { + return primitive_pool_; +} + +template <> +const std::unordered_map>& +MKLDNNDeviceContext::GetElementPool() const { + return primitive_desc_pool_; +} + +void MKLDNNDeviceContext::Execute(bool block) { + if (pipeline_.empty()) { + return; + } + ResetStream(); + stream_->submit(pipeline_).wait(block); + ready_ = false; + pipeline_.clear(); +} + +void MKLDNNDeviceContext::ResetStream() { + if (ready_) { + return; + } + // TODO(TJ): change me when mkldnn have specific method to reset this state + stream_.reset(new mkldnn::stream(mkldnn::stream::kind::eager)); + ready_ = true; +} + +#endif + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h new file mode 100644 index 0000000000000000000000000000000000000000..10b581f41a1e9473a2f85d3e5d2e40ee1fdaa1af --- /dev/null +++ b/paddle/fluid/platform/device_context.h @@ -0,0 +1,210 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include + +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/dynload/cublas.h" +#include "paddle/fluid/platform/dynload/cudnn.h" +#include "paddle/fluid/platform/gpu_info.h" +#define EIGEN_USE_GPU +#endif + +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" +#include "unsupported/Eigen/CXX11/Tensor" + +#include "glog/logging.h" + +namespace paddle { +namespace platform { + +class DeviceContext { + public: + virtual ~DeviceContext() {} + virtual Place GetPlace() const = 0; + + virtual void Wait() const {} +}; + +class CPUDeviceContext : public DeviceContext { + public: + CPUDeviceContext(); + explicit CPUDeviceContext(CPUPlace place); + + Eigen::DefaultDevice* eigen_device() const; + + Place GetPlace() const override; + + private: + CPUPlace place_; + std::unique_ptr eigen_device_; +}; + +template +struct DefaultDeviceContextType; + +template <> +struct DefaultDeviceContextType { + using TYPE = CPUDeviceContext; +}; + +#ifdef PADDLE_WITH_CUDA + +class EigenCudaStreamDevice; + +class CUDADeviceContext : public DeviceContext { + public: + explicit CUDADeviceContext(CUDAPlace place); + virtual ~CUDADeviceContext(); + + /*! \brief Wait for all operations completion in the stream. */ + void Wait() const override; + + /*! \brief Return place in the device context. */ + Place GetPlace() const override; + + /*! \brief Return eigen device in the device context. */ + Eigen::GpuDevice* eigen_device() const; + + /*! \brief Return cublas handle in the device context. */ + cublasHandle_t cublas_handle() const; + + /*! \brief Return cudnn handle in the device context. */ + cudnnHandle_t cudnn_handle() const; + + /*! \brief Return cuda stream in the device context. */ + cudaStream_t stream() const; + + private: + CUDAPlace place_; + + std::unique_ptr eigen_device_; + std::unique_ptr eigen_stream_; + + cudaStream_t stream_; + cudnnHandle_t cudnn_handle_; + cublasHandle_t cublas_handle_; +}; + +template <> +struct DefaultDeviceContextType { + using TYPE = CUDADeviceContext; +}; + +#endif + +#ifdef PADDLE_WITH_MKLDNN +class MKLDNNDeviceContext : public CPUDeviceContext { + public: + explicit MKLDNNDeviceContext(CPUPlace place); + + /* \brief Add new element: memory, primitive or primitive desc */ + template + void AddElement(const std::string& op_key, const T& value); + + /* \brief Get existed element: memory, primitive or primitive desc */ + template + const T& GetElement(const std::string& op_key) const; + + /* \brief Get element pool: memory, primitive or primitive desc pool */ + template + const std::unordered_map>& + GetElementPool() const; + + /* \brief Get the active engine */ + const MKLDNNEngine& engine() const { return *engine_; } + + /* \brief Submit primitive to pipeline */ + void Submit(const MKLDNNPrimitivePtr& p) { pipeline_.push_back(*p); } + + /*! \brief Execute all submitted primitives in pipeline */ + void Execute(bool block = true); + + protected: + /*! \brief Reset the stream to prepare next exectue */ + void ResetStream(); + + private: + std::unordered_map> + memory_pool_; + std::unordered_map> + primitive_pool_; + std::unordered_map> + primitive_desc_pool_; + std::vector pipeline_; + MKLDNNStreamPtr stream_; + MKLDNNEnginePtr engine_; + bool ready_; +}; +#endif + +/*! 
\brief device context pool singleton */ +class DeviceContextPool { + public: + explicit DeviceContextPool(const std::vector<platform::Place>& places); + + static DeviceContextPool& Instance() { + PADDLE_ENFORCE_NOT_NULL(pool, "Need to Create DeviceContextPool first!"); + return *pool; + } + + /*! \brief Create should only be called by the Init function */ + static DeviceContextPool& Init(const std::vector<platform::Place>& places) { + if (pool == nullptr) { + pool = new DeviceContextPool(places); + } + return *pool; + } + + /*! \brief Return handle of single device context. */ + const platform::DeviceContext* Get(const platform::Place& place); + + template <typename Place> + const typename DefaultDeviceContextType<Place>::TYPE* GetByPlace( + const Place& place) { + return reinterpret_cast< + const typename DefaultDeviceContextType<Place>::TYPE*>(Get(place)); + } + + size_t size() const { return device_contexts_.size(); } + + private: + static DeviceContextPool* pool; + constexpr static int LEFT_SHIFT = 8; + struct Hash { + std::hash<int> hash_; + size_t operator()(const platform::Place& place) const { + int pre_hash = place.which() << LEFT_SHIFT; + if (platform::is_gpu_place(place)) { + pre_hash += boost::get<platform::CUDAPlace>(place).GetDeviceId(); + } + return hash_(pre_hash); + } + }; + std::unordered_map<const platform::Place, const platform::DeviceContext*, Hash> + device_contexts_; + DISABLE_COPY_AND_ASSIGN(DeviceContextPool); +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device_context_test.cu b/paddle/fluid/platform/device_context_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..f4dae6e90a8d12fc2dccab6fd3c6881e58e80fed --- /dev/null +++ b/paddle/fluid/platform/device_context_test.cu @@ -0,0 +1,86 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
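DeviceContextPool keys its map with a custom Hash functor because platform::Place is a boost::variant, which has no std::hash; the functor packs the variant index above LEFT_SHIFT and the GPU device id into the low bits. A standalone sketch of that packing (the index values are made-up inputs, not the real which() results):

#include <cassert>

int MakeKey(int which, int device_id) {
  const int kLeftShift = 8;                  // mirrors DeviceContextPool::LEFT_SHIFT
  return (which << kLeftShift) + device_id;  // same packing as Hash::operator()
}

int main() {
  assert(MakeKey(0, 0) != MakeKey(1, 0));  // different place kinds never collide
  assert(MakeKey(1, 0) != MakeKey(1, 1));  // nor do different device ids
  return 0;
}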
*/ + +#include "gtest/gtest.h" +#include "paddle/fluid/platform/device_context.h" + +#include "glog/logging.h" + +TEST(Device, Init) { + using paddle::platform::DeviceContext; + using paddle::platform::CUDADeviceContext; + using paddle::platform::CUDAPlace; + + int count = paddle::platform::GetCUDADeviceCount(); + for (int i = 0; i < count; i++) { + CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i)); + Eigen::GpuDevice* gpu_device = device_context->eigen_device(); + ASSERT_NE(nullptr, gpu_device); + delete device_context; + } +} + +TEST(Device, CUDADeviceContext) { + using paddle::platform::CUDADeviceContext; + using paddle::platform::CUDAPlace; + + int count = paddle::platform::GetCUDADeviceCount(); + for (int i = 0; i < count; i++) { + CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i)); + Eigen::GpuDevice* gpu_device = device_context->eigen_device(); + ASSERT_NE(nullptr, gpu_device); + cudnnHandle_t cudnn_handle = device_context->cudnn_handle(); + ASSERT_NE(nullptr, cudnn_handle); + cublasHandle_t cublas_handle = device_context->cublas_handle(); + ASSERT_NE(nullptr, cublas_handle); + ASSERT_NE(nullptr, device_context->stream()); + delete device_context; + } +} + +TEST(Device, DeviceContextPool) { + using paddle::platform::DeviceContextPool; + using paddle::platform::CUDADeviceContext; + using paddle::platform::Place; + using paddle::platform::CPUPlace; + using paddle::platform::CUDAPlace; + + DeviceContextPool& pool = DeviceContextPool::Instance(); + auto cpu_dev_ctx1 = pool.Get(CPUPlace()); + auto cpu_dev_ctx2 = pool.Get(CPUPlace()); + ASSERT_EQ(cpu_dev_ctx2, cpu_dev_ctx1); + + std::vector gpu_places; + int count = paddle::platform::GetCUDADeviceCount(); + for (int i = 0; i < count; ++i) { + auto dev_ctx = pool.Get(CUDAPlace(i)); + ASSERT_NE(dev_ctx, nullptr); + } +} + +int main(int argc, char** argv) { + std::vector places; + + places.emplace_back(paddle::platform::CPUPlace()); + int count = paddle::platform::GetCUDADeviceCount(); + for (int i = 0; i < count; ++i) { + places.emplace_back(paddle::platform::CUDAPlace(i)); + } + + VLOG(0) << " DeviceCount " << count; + paddle::platform::DeviceContextPool::Init(places); + + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/paddle/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt similarity index 100% rename from paddle/platform/dynload/CMakeLists.txt rename to paddle/fluid/platform/dynload/CMakeLists.txt diff --git a/paddle/fluid/platform/dynload/cublas.cc b/paddle/fluid/platform/dynload/cublas.cc new file mode 100644 index 0000000000000000000000000000000000000000..c599712554b1d6183c896eb7fc6ac5bbf71d67fc --- /dev/null +++ b/paddle/fluid/platform/dynload/cublas.cc @@ -0,0 +1,29 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/dynload/cublas.h" + +namespace paddle { +namespace platform { +namespace dynload { +std::once_flag cublas_dso_flag; +void *cublas_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +CUBLAS_BLAS_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h new file mode 100644 index 0000000000000000000000000000000000000000..05f69e506515ac092c8509fba26e5b7a0f0823f7 --- /dev/null +++ b/paddle/fluid/platform/dynload/cublas.h @@ -0,0 +1,96 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include "paddle/fluid/platform/dynload/dynamic_loader.h" + +namespace paddle { +namespace platform { +namespace dynload { + +extern std::once_flag cublas_dso_flag; +extern void *cublas_dso_handle; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load cublas routine + * via operator overloading. + * + * note: default dynamic linked libs + */ +#ifdef PADDLE_USE_DSO +#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + inline cublasStatus_t operator()(Args... args) { \ + typedef cublasStatus_t (*cublasFunc)(Args...); \ + std::call_once(cublas_dso_flag, \ + paddle::platform::dynload::GetCublasDsoHandle, \ + &cublas_dso_handle); \ + void *p_##__name = dlsym(cublas_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name +#else +#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + inline cublasStatus_t operator()(Args... 
args) { \ + return __name(args...); \ + } \ + }; \ + extern DynLoad__##__name __name +#endif + +#define DECLARE_DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) \ + DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) + +#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \ + __macro(cublasSaxpy_v2); \ + __macro(cublasDaxpy_v2); \ + __macro(cublasSgemv_v2); \ + __macro(cublasDgemv_v2); \ + __macro(cublasSgemm_v2); \ + __macro(cublasDgemm_v2); \ + __macro(cublasSgeam_v2); \ + __macro(cublasDgeam_v2); \ + __macro(cublasCreate_v2); \ + __macro(cublasDestroy_v2); \ + __macro(cublasSetStream_v2); \ + __macro(cublasSetPointerMode_v2); \ + __macro(cublasGetPointerMode_v2); \ + __macro(cublasSgemmBatched); \ + __macro(cublasDgemmBatched); \ + __macro(cublasCgemmBatched); \ + __macro(cublasZgemmBatched); \ + __macro(cublasSgemmStridedBatched); \ + __macro(cublasDgemmStridedBatched); \ + __macro(cublasCgemmStridedBatched); \ + __macro(cublasZgemmStridedBatched); \ + __macro(cublasSgetrfBatched); \ + __macro(cublasSgetriBatched); \ + __macro(cublasDgetrfBatched); \ + __macro(cublasDgetriBatched) + +CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP); + +#undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/cudnn.cc b/paddle/fluid/platform/dynload/cudnn.cc new file mode 100644 index 0000000000000000000000000000000000000000..0b1c4c4f9609ebb61674c07f6cfd615d9674dfcd --- /dev/null +++ b/paddle/fluid/platform/dynload/cudnn.cc @@ -0,0 +1,62 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/cudnn.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { +namespace dynload { +std::once_flag cudnn_dso_flag; +void* cudnn_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +CUDNN_DNN_ROUTINE_EACH(DEFINE_WRAP); +CUDNN_DNN_ROUTINE_EACH_R2(DEFINE_WRAP); + +#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R3 +CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DEFINE_WRAP); +#endif + +#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R4 +CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP); +#endif + +#ifdef CUDNN_DNN_ROUTINE_EACH_R5 +CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP); +#endif + +#ifdef CUDNN_DNN_ROUTINE_EACH_R7 +CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP); +#endif + +#ifdef PADDLE_USE_DSO +bool HasCUDNN() { + std::call_once(cudnn_dso_flag, GetCUDNNDsoHandle, &cudnn_dso_handle); + return cudnn_dso_handle != nullptr; +} + +void EnforceCUDNNLoaded(const char* fn_name) { + PADDLE_ENFORCE(cudnn_dso_handle != nullptr, + "Cannot load cudnn shared library. 
Cannot invoke method %s", + fn_name); +} +#else +bool HasCUDNN() { return true; } +#endif + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h new file mode 100644 index 0000000000000000000000000000000000000000..00dfbc83872ed04d2b4e840e3f6ac31e89a8a3cd --- /dev/null +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -0,0 +1,149 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include "paddle/fluid/platform/dynload/dynamic_loader.h" + +namespace paddle { +namespace platform { +namespace dynload { + +extern std::once_flag cudnn_dso_flag; +extern void* cudnn_dso_handle; +extern bool HasCUDNN(); + +#ifdef PADDLE_USE_DSO + +extern void EnforceCUDNNLoaded(const char* fn_name); +#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + using cudnn_func = decltype(__name(args...)) (*)(Args...); \ + std::call_once(cudnn_dso_flag, \ + paddle::platform::dynload::GetCUDNNDsoHandle, \ + &cudnn_dso_handle); \ + EnforceCUDNNLoaded(#__name); \ + void* p_##__name = dlsym(cudnn_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern struct DynLoad__##__name __name + +#else + +#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... 
args) -> decltype(__name(args...)) { \ + return __name(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#endif + +/** + * include all needed cudnn functions in HPPL + * different cudnn version has different interfaces + **/ +#define CUDNN_DNN_ROUTINE_EACH(__macro) \ + __macro(cudnnSetTensor4dDescriptor); \ + __macro(cudnnSetTensor4dDescriptorEx); \ + __macro(cudnnSetTensorNdDescriptor); \ + __macro(cudnnGetTensorNdDescriptor); \ + __macro(cudnnGetConvolutionNdForwardOutputDim); \ + __macro(cudnnGetConvolutionForwardAlgorithm); \ + __macro(cudnnCreateTensorDescriptor); \ + __macro(cudnnDestroyTensorDescriptor); \ + __macro(cudnnCreateFilterDescriptor); \ + __macro(cudnnSetFilter4dDescriptor); \ + __macro(cudnnSetFilterNdDescriptor); \ + __macro(cudnnGetFilterNdDescriptor); \ + __macro(cudnnSetPooling2dDescriptor); \ + __macro(cudnnSetPoolingNdDescriptor); \ + __macro(cudnnGetPoolingNdDescriptor); \ + __macro(cudnnDestroyFilterDescriptor); \ + __macro(cudnnCreateConvolutionDescriptor); \ + __macro(cudnnCreatePoolingDescriptor); \ + __macro(cudnnDestroyPoolingDescriptor); \ + __macro(cudnnSetConvolution2dDescriptor); \ + __macro(cudnnDestroyConvolutionDescriptor); \ + __macro(cudnnSetConvolutionNdDescriptor); \ + __macro(cudnnGetConvolutionNdDescriptor); \ + __macro(cudnnDeriveBNTensorDescriptor); \ + __macro(cudnnCreate); \ + __macro(cudnnDestroy); \ + __macro(cudnnSetStream); \ + __macro(cudnnActivationForward); \ + __macro(cudnnConvolutionForward); \ + __macro(cudnnConvolutionBackwardBias); \ + __macro(cudnnGetConvolutionForwardWorkspaceSize); \ + __macro(cudnnTransformTensor); \ + __macro(cudnnPoolingForward); \ + __macro(cudnnPoolingBackward); \ + __macro(cudnnSoftmaxBackward); \ + __macro(cudnnSoftmaxForward); \ + __macro(cudnnGetVersion); \ + __macro(cudnnGetErrorString); +CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) + +#define CUDNN_DNN_ROUTINE_EACH_R2(__macro) \ + __macro(cudnnAddTensor); \ + __macro(cudnnConvolutionBackwardData); \ + __macro(cudnnConvolutionBackwardFilter); +CUDNN_DNN_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) + +// APIs available after R3: +#if CUDNN_VERSION >= 3000 +#define CUDNN_DNN_ROUTINE_EACH_AFTER_R3(__macro) \ + __macro(cudnnGetConvolutionBackwardFilterWorkspaceSize); \ + __macro(cudnnGetConvolutionBackwardDataAlgorithm); \ + __macro(cudnnGetConvolutionBackwardFilterAlgorithm); \ + __macro(cudnnGetConvolutionBackwardDataWorkspaceSize); +CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +#endif + +// APIs available after R4: +#if CUDNN_VERSION >= 4007 +#define CUDNN_DNN_ROUTINE_EACH_AFTER_R4(__macro) \ + __macro(cudnnBatchNormalizationForwardTraining); \ + __macro(cudnnBatchNormalizationForwardInference); \ + __macro(cudnnBatchNormalizationBackward); +CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +#endif + +// APIs in R5 +#if CUDNN_VERSION >= 5000 +#define CUDNN_DNN_ROUTINE_EACH_R5(__macro) \ + __macro(cudnnCreateActivationDescriptor); \ + __macro(cudnnSetActivationDescriptor); \ + __macro(cudnnGetActivationDescriptor); \ + __macro(cudnnDestroyActivationDescriptor); +CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +#endif + +#if CUDNN_VERSION >= 7001 +#define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \ + __macro(cudnnSetConvolutionGroupCount); +CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +#endif + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/curand.cc b/paddle/fluid/platform/dynload/curand.cc new file mode 
100644 index 0000000000000000000000000000000000000000..eac690b1458a4e53f6958b77d01d2c3c9f26f6eb --- /dev/null +++ b/paddle/fluid/platform/dynload/curand.cc @@ -0,0 +1,30 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/curand.h" + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag curand_dso_flag; +void *curand_dso_handle; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +CURAND_RAND_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/curand.h b/paddle/fluid/platform/dynload/curand.h new file mode 100644 index 0000000000000000000000000000000000000000..ce3115b3ce0dbaa30af63bfc15908f81c16309d1 --- /dev/null +++ b/paddle/fluid/platform/dynload/curand.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include "paddle/fluid/platform/dynload/dynamic_loader.h" + +namespace paddle { +namespace platform { +namespace dynload { +extern std::once_flag curand_dso_flag; +extern void *curand_dso_handle; +#ifdef PADDLE_USE_DSO +#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + curandStatus_t operator()(Args... args) { \ + typedef curandStatus_t (*curandFunc)(Args...); \ + std::call_once(curand_dso_flag, \ + paddle::platform::dynload::GetCurandDsoHandle, \ + &curand_dso_handle); \ + void *p_##__name = dlsym(curand_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name +#else +#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + curandStatus_t operator()(Args... 
args) { \ + return __name(args...); \ + } \ + }; \ + extern DynLoad__##__name __name +#endif + +#define CURAND_RAND_ROUTINE_EACH(__macro) \ + __macro(curandCreateGenerator); \ + __macro(curandSetStream); \ + __macro(curandSetPseudoRandomGeneratorSeed); \ + __macro(curandGenerateUniform); \ + __macro(curandGenerateUniformDouble); \ + __macro(curandGenerateNormal); \ + __macro(curandDestroyGenerator); + +CURAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc new file mode 100644 index 0000000000000000000000000000000000000000..eb00f93b7cde0a39beb4adabd910eef634c2581c --- /dev/null +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -0,0 +1,180 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include +#include +#include +#include +#include "gflags/gflags.h" +#include "glog/logging.h" +#include "paddle/fluid/platform/enforce.h" + +DEFINE_string(cudnn_dir, "", + "Specify path for loading libcudnn.so. For instance, " + "/usr/local/cudnn/lib. If empty [default], dlopen " + "will search cudnn from LD_LIBRARY_PATH"); + +DEFINE_string(cuda_dir, "", + "Specify path for loading cuda library, such as libcublas, " + "libcurand. For instance, /usr/local/cuda/lib64. If default, " + "dlopen will search cuda from LD_LIBRARY_PATH"); + +DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so."); + +DEFINE_string(lapack_dir, "", "Specify path for loading liblapack.so."); + +DEFINE_string(nccl_dir, "", + "Specify path for loading nccl library, such as libcublas, " + "libcurand. For instance, /usr/local/cuda/lib64. If default, " + "dlopen will search cuda from LD_LIBRARY_PATH"); + +namespace paddle { +namespace platform { +namespace dynload { + +static inline std::string join(const std::string& part1, + const std::string& part2) { + // directory separator + const char sep = '/'; + if (!part2.empty() && part2.front() == sep) { + return part2; + } + std::string ret; + ret.reserve(part1.size() + part2.size() + 1); + ret = part1; + if (!ret.empty() && ret.back() != sep) { + ret += sep; + } + ret += part2; + return ret; +} + +static inline void GetDsoHandleFromDefaultPath(std::string& dso_path, + void** dso_handle, + int dynload_flags) { + VLOG(3) << "Try to find library: " << dso_path + << " from default system path."; + // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH + *dso_handle = dlopen(dso_path.c_str(), dynload_flags); + +// DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to +// bring System Integrity Projection (SIP), if dso_handle +// is null, search from default package path in Mac OS. 
+#if defined(__APPLE__) || defined(__OSX__) + if (nullptr == *dso_handle) { + dso_path = join("/usr/local/cuda/lib/", dso_path); + *dso_handle = dlopen(dso_path.c_str(), dynload_flags); + if (nullptr == *dso_handle) { + if (dso_path == "libcudnn.dylib") { + LOG(WARNING) << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n " + "For instance, sudo tar -xzf " + "cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local \n sudo " + "chmod a+r /usr/local/cuda/include/cudnn.h " + "/usr/local/cuda/lib/libcudnn*"; + } + } + } +#endif +} + +static inline void GetDsoHandleFromSearchPath(const std::string& search_root, + const std::string& dso_name, + void** dso_handle, + bool throw_on_error = true) { + int dynload_flags = RTLD_LAZY | RTLD_LOCAL; + *dso_handle = nullptr; + + std::string dlPath = dso_name; + if (search_root.empty()) { + GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags); + } else { + // search xxx.so from custom path + dlPath = join(search_root, dso_name); + *dso_handle = dlopen(dlPath.c_str(), dynload_flags); + // if not found, search from default path + if (nullptr == *dso_handle) { + LOG(WARNING) << "Failed to find dynamic library: " << dlPath << " (" + << dlerror() << ")"; + dlPath = dso_name; + GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags); + } + } + auto error_msg = + "Failed to find dynamic library: %s ( %s ) \n Please specify " + "its path correctly using following ways: \n Method. set " + "environment variable LD_LIBRARY_PATH on Linux or " + "DYLD_LIBRARY_PATH on Mac OS. \n For instance, issue command: " + "export LD_LIBRARY_PATH=... \n Note: After Mac OS 10.11, " + "using the DYLD_LIBRARY_PATH is impossible unless System " + "Integrity Protection (SIP) is disabled."; + if (throw_on_error) { + PADDLE_ENFORCE(nullptr != *dso_handle, error_msg, dlPath, dlerror()); + } else if (nullptr == *dso_handle) { + LOG(WARNING) << string::Sprintf(error_msg, dlPath, dlerror()); + } +} + +void GetCublasDsoHandle(void** dso_handle) { +#if defined(__APPLE__) || defined(__OSX__) + GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib", dso_handle); +#else + GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so", dso_handle); +#endif +} + +void GetCUDNNDsoHandle(void** dso_handle) { +#if defined(__APPLE__) || defined(__OSX__) + GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle, + false); +#else + GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle, false); +#endif +} + +void GetCurandDsoHandle(void** dso_handle) { +#if defined(__APPLE__) || defined(__OSX__) + GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle); +#else + GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle); +#endif +} + +void GetWarpCTCDsoHandle(void** dso_handle) { +#if defined(__APPLE__) || defined(__OSX__) + GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.dylib", dso_handle); +#else + GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so", dso_handle); +#endif +} + +void GetLapackDsoHandle(void** dso_handle) { +#if defined(__APPLE__) || defined(__OSX__) + GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.dylib", dso_handle); +#else + GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.so", dso_handle); +#endif +} + +void GetNCCLDsoHandle(void** dso_handle) { +#if defined(__APPLE__) || defined(__OSX__) + GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib", dso_handle); +#else + GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so", dso_handle); +#endif +} + +} // namespace dynload 
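Each wrapper generated by the DECLARE_DYNAMIC_LOAD_*_WRAP macros follows the same lazy-loading recipe: open the shared library once via std::call_once, resolve the symbol with dlsym on first use, then call through the casted function pointer. A minimal generic sketch of that recipe, using placeholder library and symbol names rather than Paddle's macros:

#include <dlfcn.h>
#include <mutex>
#include <stdexcept>

static std::once_flag g_dso_flag;
static void* g_dso_handle = nullptr;

// R is the return type and Args the argument types of the target C function,
// e.g. CallDynamic<int>("libfoo.so", "foo_add", 1, 2).
template <typename R, typename... Args>
R CallDynamic(const char* lib, const char* sym, Args... args) {
  std::call_once(g_dso_flag, [&] {
    g_dso_handle = dlopen(lib, RTLD_LAZY | RTLD_LOCAL);
  });
  if (g_dso_handle == nullptr) throw std::runtime_error("cannot load library");
  using FuncPtr = R (*)(Args...);
  auto fp = reinterpret_cast<FuncPtr>(dlsym(g_dso_handle, sym));
  if (fp == nullptr) throw std::runtime_error("cannot resolve symbol");
  return fp(args...);
}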
+} // namespace platform +} // namespace paddle diff --git a/paddle/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h similarity index 100% rename from paddle/platform/dynload/dynamic_loader.h rename to paddle/fluid/platform/dynload/dynamic_loader.h diff --git a/paddle/fluid/platform/dynload/nccl.cc b/paddle/fluid/platform/dynload/nccl.cc new file mode 100644 index 0000000000000000000000000000000000000000..1dc3e96f04a4f98e5cf0cb7848a99213ca082b79 --- /dev/null +++ b/paddle/fluid/platform/dynload/nccl.cc @@ -0,0 +1,35 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/nccl.h" + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag nccl_dso_flag; +void *nccl_dso_handle; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +NCCL_RAND_ROUTINE_EACH(DEFINE_WRAP); + +void LoadNCCLDSO() { + platform::call_once(nccl_dso_flag, + [] { GetNCCLDsoHandle(&nccl_dso_handle); }); +} + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/nccl.h b/paddle/fluid/platform/dynload/nccl.h new file mode 100644 index 0000000000000000000000000000000000000000..349a4d0ba325fe1f7c23873c82dce0d35d295340 --- /dev/null +++ b/paddle/fluid/platform/dynload/nccl.h @@ -0,0 +1,75 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include "paddle/fluid/platform/call_once.h" +#include "paddle/fluid/platform/dynload/dynamic_loader.h" + +namespace paddle { +namespace platform { +namespace dynload { + +extern std::once_flag nccl_dso_flag; +extern void* nccl_dso_handle; + +#ifdef PADDLE_USE_DSO +extern void LoadNCCLDSO(); + +#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + using nccl_func = decltype(__name(args...)) (*)(Args...); \ + paddle::platform::dynload::LoadNCCLDSO(); \ + void* p_##__name = dlsym(nccl_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name +#else +#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + ncclResult_t operator()(Args... 
args) { \ + return __name(args...); \ + } \ + }; \ + extern DynLoad__##__name __name +#endif + +#define NCCL_RAND_ROUTINE_EACH(__macro) \ + __macro(ncclCommInitAll); \ + __macro(ncclGetUniqueId); \ + __macro(ncclCommInitRank); \ + __macro(ncclCommDestroy); \ + __macro(ncclCommCount); \ + __macro(ncclCommCuDevice); \ + __macro(ncclCommUserRank); \ + __macro(ncclAllReduce); \ + __macro(ncclBcast); \ + __macro(ncclAllGather); \ + __macro(ncclGroupStart); \ + __macro(ncclGroupEnd); \ + __macro(ncclReduce); \ + __macro(ncclGetErrorString); + +NCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NCCL_WRAP) + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/warpctc.cc b/paddle/fluid/platform/dynload/warpctc.cc new file mode 100644 index 0000000000000000000000000000000000000000..84de2cae94790ca44c8c0f87d75a45a1b7001a64 --- /dev/null +++ b/paddle/fluid/platform/dynload/warpctc.cc @@ -0,0 +1,30 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/warpctc.h" + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag warpctc_dso_flag; +void* warpctc_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +WARPCTC_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/warpctc.h b/paddle/fluid/platform/dynload/warpctc.h new file mode 100644 index 0000000000000000000000000000000000000000..f1955818dede54b0d9c53cb33b95c6102b854d7b --- /dev/null +++ b/paddle/fluid/platform/dynload/warpctc.h @@ -0,0 +1,63 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "ctc.h" +#include "paddle/fluid/platform/dynload/dynamic_loader.h" + +namespace paddle { +namespace platform { +namespace dynload { + +extern std::once_flag warpctc_dso_flag; +extern void* warpctc_dso_handle; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load warpctc routine + * via operator overloading. + */ +#define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... 
args) -> decltype(__name(args...)) { \ + using warpctcFunc = decltype(__name(args...)) (*)(Args...); \ + std::call_once(warpctc_dso_flag, \ + paddle::platform::dynload::GetWarpCTCDsoHandle, \ + &warpctc_dso_handle); \ + void* p_##_name = dlsym(warpctc_dso_handle, #__name); \ + return reinterpret_cast(p_##_name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ + DYNAMIC_LOAD_WARPCTC_WRAP(__name) + +#define WARPCTC_ROUTINE_EACH(__macro) \ + __macro(get_warpctc_version); \ + __macro(ctcGetStatusString); \ + __macro(compute_ctc_loss); \ + __macro(get_workspace_size) + +WARPCTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP); + +#undef DYNAMIC_LOAD_WARPCTC_WRAP + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/enforce.cc b/paddle/fluid/platform/enforce.cc new file mode 100644 index 0000000000000000000000000000000000000000..55cd80943cf18545d3cac6f3ebddee1030d62b37 --- /dev/null +++ b/paddle/fluid/platform/enforce.cc @@ -0,0 +1,19 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform {} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h new file mode 100644 index 0000000000000000000000000000000000000000..b22893c0a56f39347894924b3cd6ea64180aa8b6 --- /dev/null +++ b/paddle/fluid/platform/enforce.h @@ -0,0 +1,258 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
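The *_ROUTINE_EACH lists used throughout these dynload files are a classic X-macro: one macro enumerates the API names, the header applies a declare-wrapper macro to the list, and the .cc applies DEFINE_WRAP to instantiate the global wrapper objects. A toy, self-contained version of the pattern (foo and bar are placeholders, not real library entry points):

#include <cstdio>

#define MY_ROUTINE_EACH(__macro) \
  __macro(foo);                  \
  __macro(bar)

#define DECLARE_WRAP(__name)                    \
  struct DynLoad__##__name {                    \
    void operator()() { std::puts(#__name); }   \
  };                                            \
  extern DynLoad__##__name __name
#define DEFINE_WRAP(__name) DynLoad__##__name __name

MY_ROUTINE_EACH(DECLARE_WRAP);  // what the headers do
MY_ROUTINE_EACH(DEFINE_WRAP);   // what the .cc files do

int main() {
  foo();  // prints "foo" via the generated functor
  bar();
  return 0;
}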
*/ + +#pragma once + +#include // for dladdr +#include // for backtrace +#include +#include +#include +#include +#include + +#include "paddle/fluid/platform/macros.h" +#include "paddle/string/printf.h" +#include "paddle/string/to_string.h" + +#ifdef __GNUC__ +#include // for __cxa_demangle +#endif + +#include + +#ifdef PADDLE_WITH_CUDA + +#include "paddle/fluid/platform/dynload/cublas.h" +#include "paddle/fluid/platform/dynload/cudnn.h" +#include "paddle/fluid/platform/dynload/curand.h" +#include "paddle/fluid/platform/dynload/nccl.h" + +#include +#include +#include +#include +#include + +#endif + +namespace paddle { +namespace platform { + +#ifdef __GNUC__ +inline std::string demangle(std::string name) { + int status = -4; // some arbitrary value to eliminate the compiler warning + std::unique_ptr res{ + abi::__cxa_demangle(name.c_str(), NULL, NULL, &status), std::free}; + return (status == 0) ? res.get() : name; +} +#else +inline std::string demangle(std::string name) { return name; } +#endif + +struct EnforceNotMet : public std::exception { + std::exception_ptr exp_; + std::string err_str_; + EnforceNotMet(std::exception_ptr e, const char* f, int l) : exp_(e) { + static constexpr int TRACE_STACK_LIMIT = 100; + try { + std::rethrow_exception(exp_); + } catch (const std::exception& exp) { + std::ostringstream sout; + + sout << string::Sprintf("%s at [%s:%d]", exp.what(), f, l) << std::endl; + sout << "PaddlePaddle Call Stacks: " << std::endl; + + void* call_stack[TRACE_STACK_LIMIT]; + auto size = backtrace(call_stack, TRACE_STACK_LIMIT); + auto symbols = backtrace_symbols(call_stack, size); + + Dl_info info; + for (int i = 0; i < size; ++i) { + if (dladdr(call_stack[i], &info) && info.dli_sname) { + auto demangled = demangle(info.dli_sname); + auto addr_offset = static_cast(call_stack[i]) - + static_cast(info.dli_saddr); + sout << string::Sprintf("%-3d %*0p %s + %zd\n", i, + 2 + sizeof(void*) * 2, call_stack[i], + demangled, addr_offset); + } else { + sout << string::Sprintf("%-3d %*0p\n", i, 2 + sizeof(void*) * 2, + call_stack[i]); + } + } + free(symbols); + err_str_ = sout.str(); + } + } + + const char* what() const noexcept { return err_str_.c_str(); } +}; + +// Because most enforce conditions would evaluate to true, we can use +// __builtin_expect to instruct the C++ compiler to generate code that +// always forces branch prediction of true. +// This generates faster binary code. __builtin_expect is since C++11. +// For more details, please check https://stackoverflow.com/a/43870188/724872. +#define UNLIKELY(condition) __builtin_expect(static_cast(condition), 0) + +template +inline typename std::enable_if::type throw_on_error( + bool stat, const Args&... args) { + if (UNLIKELY(!(stat))) { + throw std::runtime_error(string::Sprintf(args...)); + } +} + +#ifdef PADDLE_WITH_CUDA + +template +inline typename std::enable_if::type throw_on_error( + cudaError_t e, const Args&... args) { + if (UNLIKELY(e)) { + throw thrust::system_error(e, thrust::cuda_category(), + string::Sprintf(args...)); + } +} + +template +inline typename std::enable_if::type throw_on_error( + curandStatus_t stat, const Args&... args) { + if (stat != CURAND_STATUS_SUCCESS) { + throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(), + string::Sprintf(args...)); + } +} + +template +inline typename std::enable_if::type throw_on_error( + cudnnStatus_t stat, const Args&... 
args) { + if (stat == CUDNN_STATUS_SUCCESS) { + return; + } else { + throw std::runtime_error(platform::dynload::cudnnGetErrorString(stat) + + string::Sprintf(args...)); + } +} + +template +inline typename std::enable_if::type throw_on_error( + cublasStatus_t stat, const Args&... args) { + std::string err; + if (stat == CUBLAS_STATUS_SUCCESS) { + return; + } else if (stat == CUBLAS_STATUS_NOT_INITIALIZED) { + err = "CUBLAS: not initialized, "; + } else if (stat == CUBLAS_STATUS_ALLOC_FAILED) { + err = "CUBLAS: alloc failed, "; + } else if (stat == CUBLAS_STATUS_INVALID_VALUE) { + err = "CUBLAS: invalid value, "; + } else if (stat == CUBLAS_STATUS_ARCH_MISMATCH) { + err = "CUBLAS: arch mismatch, "; + } else if (stat == CUBLAS_STATUS_MAPPING_ERROR) { + err = "CUBLAS: mapping error, "; + } else if (stat == CUBLAS_STATUS_EXECUTION_FAILED) { + err = "CUBLAS: execution failed, "; + } else if (stat == CUBLAS_STATUS_INTERNAL_ERROR) { + err = "CUBLAS: internal error, "; + } else if (stat == CUBLAS_STATUS_NOT_SUPPORTED) { + err = "CUBLAS: not supported, "; + } else if (stat == CUBLAS_STATUS_LICENSE_ERROR) { + err = "CUBLAS: license error, "; + } + throw std::runtime_error(err + string::Sprintf(args...)); +} + +template +inline typename std::enable_if::type throw_on_error( + ncclResult_t stat, const Args&... args) { + if (stat == ncclSuccess) { + return; + } else { + throw std::runtime_error(platform::dynload::ncclGetErrorString(stat) + + string::Sprintf(args...)); + } +} + +#endif // PADDLE_ONLY_CPU + +template +inline void throw_on_error(T e) { + throw_on_error(e, ""); +} + +#define PADDLE_THROW(...) \ + do { \ + throw ::paddle::platform::EnforceNotMet( \ + std::make_exception_ptr( \ + std::runtime_error(paddle::string::Sprintf(__VA_ARGS__))), \ + __FILE__, __LINE__); \ + } while (false) + +#define PADDLE_ENFORCE(...) \ + do { \ + try { \ + ::paddle::platform::throw_on_error(__VA_ARGS__); \ + } catch (...) { \ + throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ + __FILE__, __LINE__); \ + } \ + } while (false) + +/* + * Some enforce helpers here, usage: + * int a = 1; + * int b = 2; + * PADDLE_ENFORCE_EQ(a, b); + * + * will raise an expression described as follows: + * "enforce a == b failed, 1 != 2" with detailed stack information. + * + * extra messages is also supported, for example: + * PADDLE_ENFORCE(a, b, "some simple enforce failed between %d numbers", 2) + */ + +#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, ==, !=, __VA_ARGS__) +#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, !=, ==, __VA_ARGS__) +#define PADDLE_ENFORCE_GT(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >, <=, __VA_ARGS__) +#define PADDLE_ENFORCE_GE(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >=, <, __VA_ARGS__) +#define PADDLE_ENFORCE_LT(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <, >=, __VA_ARGS__) +#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__) +#define PADDLE_ENFORCE_NOT_NULL(__VAL, ...) \ + do { \ + if (UNLIKELY(nullptr == (__VAL))) { \ + PADDLE_THROW(#__VAL " should not be null\n%s", \ + paddle::string::Sprintf("" __VA_ARGS__)); \ + } \ + } while (0) + +#define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...) 
\ + do { \ + if (UNLIKELY(!((__VAL0)__CMP(__VAL1)))) { \ + PADDLE_THROW("enforce %s " #__CMP " %s failed, %s " #__INV_CMP \ + " %s\n%s", \ + #__VAL0, #__VAL1, paddle::string::to_string(__VAL0), \ + paddle::string::to_string(__VAL1), \ + paddle::string::Sprintf("" __VA_ARGS__)); \ + } \ + } while (0) + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..896a9a04eca80f22bebf87859d277458d9bdb092 --- /dev/null +++ b/paddle/fluid/platform/enforce_test.cc @@ -0,0 +1,216 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/string/piece.h" + +using StringPiece = paddle::string::Piece; +using paddle::string::HasPrefix; + +TEST(ENFORCE, OK) { + PADDLE_ENFORCE(true, "Enforce is ok %d now %f", 123, 0.345); + size_t val = 1; + const size_t limit = 10; + PADDLE_ENFORCE(val < limit, "Enforce is OK too"); +} + +TEST(ENFORCE, FAILED) { + bool caught_exception = false; + try { + PADDLE_ENFORCE(false, "Enforce is not ok %d at all", 123); + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + EXPECT_TRUE( + HasPrefix(StringPiece(error.what()), "Enforce is not ok 123 at all")); + } + EXPECT_TRUE(caught_exception); +} + +TEST(ENFORCE, NO_ARG_OK) { + int a = 2; + int b = 2; + PADDLE_ENFORCE_EQ(a, b); + // test enforce with extra message. 
+ PADDLE_ENFORCE_EQ(a, b, "some thing wrong %s", "info"); +} + +TEST(ENFORCE_EQ, NO_EXTRA_MSG_FAIL) { + int a = 2; + bool caught_exception = false; + try { + PADDLE_ENFORCE_EQ(a, 1 + 3); + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + HasPrefix(StringPiece(error.what()), "enforce a == 1 + 3 failed, 2 != 4"); + } + EXPECT_TRUE(caught_exception); +} + +TEST(ENFORCE_EQ, EXTRA_MSG_FAIL) { + int a = 2; + bool caught_exception = false; + try { + PADDLE_ENFORCE_EQ(a, 1 + 3, "%s size not match", "their"); + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + HasPrefix(StringPiece(error.what()), + "enforce a == 1 + 3 failed, 2 != 4\ntheir size not match"); + } + EXPECT_TRUE(caught_exception); +} + +TEST(ENFORCE_NE, OK) { + PADDLE_ENFORCE_NE(1, 2); + PADDLE_ENFORCE_NE(1.0, 2UL); +} +TEST(ENFORCE_NE, FAIL) { + bool caught_exception = false; + + try { + // 2UL here to check data type compatible + PADDLE_ENFORCE_NE(1.0, 1UL); + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + EXPECT_TRUE(HasPrefix(StringPiece(error.what()), + "enforce 1.0 != 1UL failed, 1 == 1")) + << error.what() << " does not have expected prefix"; + } + EXPECT_TRUE(caught_exception); +} + +TEST(ENFORCE_GT, OK) { PADDLE_ENFORCE_GT(2, 1); } +TEST(ENFORCE_GT, FAIL) { + bool caught_exception = false; + try { + PADDLE_ENFORCE_GT(1, 2UL); + + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + EXPECT_TRUE( + HasPrefix(StringPiece(error.what()), "enforce 1 > 2UL failed, 1 <= 2")); + } + EXPECT_TRUE(caught_exception); +} + +TEST(ENFORCE_GE, OK) { + PADDLE_ENFORCE_GE(2, 2UL); + PADDLE_ENFORCE_GE(3, 2UL); + PADDLE_ENFORCE_GE(3, 2); + PADDLE_ENFORCE_GE(3.21, 2UL); +} +TEST(ENFORCE_GE, FAIL) { + bool caught_exception = false; + try { + PADDLE_ENFORCE_GE(1, 2UL); + + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + EXPECT_TRUE( + HasPrefix(StringPiece(error.what()), "enforce 1 >= 2UL failed, 1 < 2")); + } + EXPECT_TRUE(caught_exception); +} + +TEST(ENFORCE_LE, OK) { + PADDLE_ENFORCE_LE(1, 1); + PADDLE_ENFORCE_LE(1, 1UL); + PADDLE_ENFORCE_LE(2, 3UL); + PADDLE_ENFORCE_LE(2UL, 3); + PADDLE_ENFORCE_LE(2UL, 3.2); +} +TEST(ENFORCE_LE, FAIL) { + bool caught_exception = false; + try { + PADDLE_ENFORCE_GT(1, 2UL); + + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + EXPECT_TRUE( + HasPrefix(StringPiece(error.what()), "enforce 1 > 2UL failed, 1 <= 2")); + } + EXPECT_TRUE(caught_exception); +} + +TEST(ENFORCE_LT, OK) { + PADDLE_ENFORCE_LT(3, 10); + PADDLE_ENFORCE_LT(2, 3UL); + PADDLE_ENFORCE_LT(2UL, 3); +} +TEST(ENFORCE_LT, FAIL) { + bool caught_exception = false; + try { + PADDLE_ENFORCE_LT(1UL, 0.12); + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + EXPECT_TRUE(HasPrefix(StringPiece(error.what()), + "enforce 1UL < 0.12 failed, 1 >= 0.12")); + } + EXPECT_TRUE(caught_exception); +} + +TEST(ENFORCE_NOT_NULL, OK) { + int* a = new int; + PADDLE_ENFORCE_NOT_NULL(a); + delete a; +} +TEST(ENFORCE_NOT_NULL, FAIL) { + bool caught_exception = false; + try { + int* a = nullptr; + PADDLE_ENFORCE_NOT_NULL(a); + + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + EXPECT_TRUE(HasPrefix(StringPiece(error.what()), "a should not be null")); + } + EXPECT_TRUE(caught_exception); +} + +struct Dims { + size_t dims_[4]; + + bool operator==(const Dims& o) const { + for (size_t i = 0; i < 4; ++i) { + if (dims_[i] != o.dims_[i]) return false; + } 
+ return true; + } +}; + +std::ostream& operator<<(std::ostream& os, const Dims& d) { + for (size_t i = 0; i < 4; ++i) { + if (i == 0) { + os << "["; + } + os << d.dims_[i]; + if (i == 4 - 1) { + os << "]"; + } else { + os << ", "; + } + } + return os; +} + +TEST(ENFORCE_USER_DEFINED_CLASS, EQ) { + Dims a{{1, 2, 3, 4}}, b{{1, 2, 3, 4}}; + PADDLE_ENFORCE_EQ(a, b); +} + +TEST(ENFORCE_USER_DEFINED_CLASS, NE) { + Dims a{{1, 2, 3, 4}}, b{{5, 6, 7, 8}}; + ASSERT_THROW(PADDLE_ENFORCE_EQ(a, b), paddle::platform::EnforceNotMet); +} diff --git a/paddle/fluid/platform/for_range.h b/paddle/fluid/platform/for_range.h new file mode 100644 index 0000000000000000000000000000000000000000..0e695328c394cd5cd1a14f42b7c82e8899e2167b --- /dev/null +++ b/paddle/fluid/platform/for_range.h @@ -0,0 +1,85 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace platform { + +template +struct ForRange { + ForRange(const DeviceContext& dev_ctx, size_t limit); + + template + void operator()(Function func) const; +}; + +template <> +struct ForRange { + ForRange(const CPUDeviceContext& dev_ctx, size_t limit) : limit_(limit) {} + + template + void operator()(Function func) const { + for (size_t i = 0; i < limit_; ++i) { + func(i); + } + } + + size_t limit_; +}; + +#ifdef __NVCC__ +template +__global__ static void ForRangeElemwiseOpGridIsOne(Function func) { + size_t idx = static_cast(threadIdx.x); + func(idx); +} + +template +__global__ static void ForRangeElemwiseOp(Function func, int limit) { + size_t idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (idx < limit) { + func(idx); + } +} + +template <> +struct ForRange { + ForRange(const CUDADeviceContext& dev_ctx, size_t limit) + : dev_ctx_(dev_ctx), limit_(static_cast(limit)) {} + + template + inline void operator()(Function func) const { + constexpr int num_threads = 1024; + int block_size = limit_ <= num_threads ? limit_ : num_threads; + int grid_size = (limit_ + num_threads - 1) / num_threads; + + if (grid_size == 1) { + ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>( + func); + } else { + ForRangeElemwiseOp<<>>( + func, limit_); + } + } + + const CUDADeviceContext& dev_ctx_; + int limit_; +}; + +#endif + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc new file mode 100644 index 0000000000000000000000000000000000000000..1797f59a9c9731d28febe32d268d2b07073550eb --- /dev/null +++ b/paddle/fluid/platform/gpu_info.cc @@ -0,0 +1,112 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
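ForRange gives element-wise kernels one call site for both devices: the CPU specialization is a plain loop, while the CUDA specialization launches a grid sized from the same limit. A hedged in-tree usage sketch for the CPU case (the Scale functor and ScaleBuffer helper are illustrative, not existing Paddle code):

#include <cstddef>

#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/for_range.h"

struct Scale {
  float* data;
  float factor;
  void operator()(size_t i) const { data[i] *= factor; }
};

void ScaleBuffer(const paddle::platform::CPUDeviceContext& dev_ctx, float* buf,
                 size_t n, float factor) {
  paddle::platform::ForRange<paddle::platform::CPUDeviceContext> for_range(
      dev_ctx, n);
  for_range(Scale{buf, factor});  // invokes Scale::operator() for i in [0, n)
}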
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/gpu_info.h" + +#include "gflags/gflags.h" + +#include "paddle/fluid/platform/enforce.h" + +DEFINE_double(fraction_of_gpu_memory_to_use, 0.92, + "Default use 92% of GPU memory for PaddlePaddle," + "reserve the rest for page tables, etc"); + +namespace paddle { +namespace platform { + +int GetCUDADeviceCount() { + int count; + PADDLE_ENFORCE( + cudaGetDeviceCount(&count), + "cudaGetDeviceCount failed in paddle::platform::GetCUDADeviceCount"); + return count; +} + +int GetCurrentDeviceId() { + int device_id; + PADDLE_ENFORCE( + cudaGetDevice(&device_id), + "cudaGetDevice failed in paddle::platform::GetCurrentDeviceId"); + return device_id; +} + +void SetDeviceId(int id) { + // TODO(qijun): find a better way to cache the cuda device count + PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); + PADDLE_ENFORCE(cudaSetDevice(id), + "cudaSetDevice failed in paddle::platform::SetDeviceId"); +} + +void GpuMemoryUsage(size_t &available, size_t &total) { + PADDLE_ENFORCE(cudaMemGetInfo(&available, &total), + "cudaMemGetInfo failed in paddle::platform::GetMemoryUsage"); +} + +size_t GpuMaxAllocSize() { + size_t total = 0; + size_t available = 0; + + GpuMemoryUsage(available, total); + + // Reserve the rest for page tables, etc. + return static_cast(total * FLAGS_fraction_of_gpu_memory_to_use); +} + +size_t GpuMinChunkSize() { + // Allow to allocate the minimum chunk size is 256 bytes. + return 1 << 8; +} + +size_t GpuMaxChunkSize() { + size_t total = 0; + size_t available = 0; + + GpuMemoryUsage(available, total); + VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/" + << total / 1024 / 1024 << "M"; + size_t reserving = static_cast(0.05 * total); + // If available less than minimum chunk size, no usable memory exists. + available = + std::min(std::max(available, GpuMinChunkSize()) - GpuMinChunkSize(), + total - reserving); + + // Reserving the rest memory for page tables, etc. 
+ + size_t allocating = static_cast(FLAGS_fraction_of_gpu_memory_to_use * + (total - reserving)); + + PADDLE_ENFORCE_LE(allocating, available); + + return allocating; +} + +void GpuMemcpyAsync(void *dst, const void *src, size_t count, + enum cudaMemcpyKind kind, cudaStream_t stream) { + PADDLE_ENFORCE(cudaMemcpyAsync(dst, src, count, kind, stream), + "cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync"); +} + +void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device, + size_t count, cudaStream_t stream) { + PADDLE_ENFORCE( + cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream), + "cudaMemcpyPeerAsync failed in paddle::platform::GpuMemcpyPeer"); +} + +void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream) { + PADDLE_ENFORCE(cudaMemsetAsync(dst, value, count, stream), + "cudaMemsetAsync failed in paddle::platform::GpuMemsetAsync"); +} +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h similarity index 100% rename from paddle/platform/gpu_info.h rename to paddle/fluid/platform/gpu_info.h diff --git a/paddle/platform/hostdevice.h b/paddle/fluid/platform/hostdevice.h similarity index 100% rename from paddle/platform/hostdevice.h rename to paddle/fluid/platform/hostdevice.h diff --git a/paddle/platform/macros.h b/paddle/fluid/platform/macros.h similarity index 100% rename from paddle/platform/macros.h rename to paddle/fluid/platform/macros.h diff --git a/paddle/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h similarity index 100% rename from paddle/platform/mkldnn_helper.h rename to paddle/fluid/platform/mkldnn_helper.h diff --git a/paddle/fluid/platform/nccl_test.cu b/paddle/fluid/platform/nccl_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..75b95aff1a41dac70f1b732938c648ca55b2a973 --- /dev/null +++ b/paddle/fluid/platform/nccl_test.cu @@ -0,0 +1,154 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include + +#include "glog/logging.h" +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/init.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/dynload/nccl.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/gpu_info.h" + +static int dev_count = 0; + +namespace paddle { +namespace platform { + +TEST(NCCL, init) { + std::vector comms; + comms.resize(dev_count); + PADDLE_ENFORCE(dynload::ncclCommInitAll(comms.data(), dev_count, nullptr)); + + for (int i = 0; i < dev_count; ++i) { + dynload::ncclCommDestroy(comms[i]); + } +} + +template +struct PerThreadData { + thrust::device_vector send_buff; + thrust::device_vector recv_buff; + CUDADeviceContext dev_ctx; + + T* SendBuff() { return thrust::raw_pointer_cast(send_buff.data()); } + + T* RecvBuff() { return thrust::raw_pointer_cast(recv_buff.data()); } + + PerThreadData(int gpu_id, size_t size) : dev_ctx(CUDAPlace(gpu_id)) { + send_buff.resize(size); + for (size_t i = 0; i < size; ++i) { + send_buff[i] = static_cast(i); + } + recv_buff.resize(size); + } +}; + +static constexpr int ELEM_COUNT = 10000; + +TEST(NCCL, all_reduce) { + std::vector comms; + comms.resize(dev_count); + VLOG(1) << "Initializing ncclComm"; + dynload::ncclCommInitAll(comms.data(), dev_count, nullptr); + VLOG(1) << "ncclComm initialized"; + VLOG(1) << "Creating thread data"; + std::vector>> data; + data.reserve(dev_count); + for (int i = 0; i < dev_count; ++i) { + VLOG(1) << "Creating thread data for device " << i; + SetDeviceId(i); + data.emplace_back(new PerThreadData(i, ELEM_COUNT)); + } + VLOG(1) << "Thread data created"; + + VLOG(1) << "Check send_buf data"; + for (int i = 0; i < dev_count; ++i) { + VLOG(1) << "Check on device " << i; + SetDeviceId(i); + thrust::host_vector tmp = data[i]->send_buff; + for (size_t j = 0; j < tmp.size(); ++j) { + ASSERT_NEAR(static_cast(j), tmp[j], 1e-5); + } + } + + VLOG(1) << "Invoking ncclAllReduce"; + + for (int i = 0; i < dev_count; ++i) { + VLOG(1) << "Invoking ncclAllReduce with device " << i; + SetDeviceId(i); + PADDLE_ENFORCE(dynload::ncclAllReduce( + data[i]->SendBuff(), data[i]->RecvBuff(), ELEM_COUNT, ncclDouble, + ncclSum, comms[i], data[i]->dev_ctx.stream())); + VLOG(1) << "Invoked ncclAllReduce for device " << i; + } + + VLOG(1) << "Invoked ncclAllReduce"; + + VLOG(1) << "Sync devices"; + for (int i = 0; i < dev_count; ++i) { + VLOG(1) << "Sync device " << i; + SetDeviceId(i); + data[i]->dev_ctx.Wait(); + } + VLOG(1) << "device synced"; + + for (int i = 0; i < dev_count; ++i) { + SetDeviceId(i); + VLOG(1) << "Checking vector on device " << i; + thrust::host_vector tmp = data[i]->recv_buff; + for (size_t j = 0; j < tmp.size(); ++j) { + auto elem = static_cast(j); + elem *= dev_count; + ASSERT_NEAR(tmp[j], elem, 1e-4); + } + } + + for (int i = 0; i < dev_count; ++i) { + dynload::ncclCommDestroy(comms[i]); + } +} +} // namespace platform +} // namespace paddle + +int main(int argc, char** argv) { + // FIXME(tonyyang-svail): + // Due to the driver issue on our CI, disable for now + return 0; + dev_count = paddle::platform::GetCUDADeviceCount(); + if (dev_count <= 1) { + LOG(WARNING) + << "Cannot test multi-gpu nccl, because the CUDA device count is " + << dev_count; + return 0; + } + + std::vector places; + + places.emplace_back(paddle::platform::CPUPlace()); + int count = paddle::platform::GetCUDADeviceCount(); + for (int i = 0; i < count; ++i) { + places.emplace_back(paddle::platform::CUDAPlace(i)); + } + + VLOG(0) << " 
DeviceCount " << count; + paddle::platform::DeviceContextPool::Init(places); + + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/paddle/fluid/platform/place.cc b/paddle/fluid/platform/place.cc new file mode 100644 index 0000000000000000000000000000000000000000..e99b75d761abcf065070f463d578171797383cea --- /dev/null +++ b/paddle/fluid/platform/place.cc @@ -0,0 +1,73 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace platform { + +namespace detail { + +class PlacePrinter : public boost::static_visitor<> { + public: + explicit PlacePrinter(std::ostream &os) : os_(os) {} + void operator()(const CPUPlace &) { os_ << "CPUPlace"; } + void operator()(const CUDAPlace &p) { + os_ << "CUDAPlace(" << p.device << ")"; + } + + private: + std::ostream &os_; +}; + +} // namespace detail + +static Place the_default_place; + +void set_place(const Place &place) { the_default_place = place; } +const Place &get_place() { return the_default_place; } + +const CUDAPlace default_gpu() { return CUDAPlace(0); } +const CPUPlace default_cpu() { return CPUPlace(); } + +bool is_gpu_place(const Place &p) { + return boost::apply_visitor(IsCUDAPlace(), p); +} + +bool is_cpu_place(const Place &p) { return !is_gpu_place(p); } + +bool places_are_same_class(const Place &p1, const Place &p2) { + return p1.which() == p2.which(); +} + +bool is_same_place(const Place &p1, const Place &p2) { + if (places_are_same_class(p1, p2)) { + if (is_cpu_place(p1)) { + return true; + } else { + return boost::get(p1) == boost::get(p2); + } + } else { + return false; + } +} + +std::ostream &operator<<(std::ostream &os, const Place &p) { + detail::PlacePrinter printer(os); + boost::apply_visitor(printer, p); + return os; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/place.h b/paddle/fluid/platform/place.h new file mode 100644 index 0000000000000000000000000000000000000000..2977a41036e84fc5aabd69c24cc9e62391b7dc38 --- /dev/null +++ b/paddle/fluid/platform/place.h @@ -0,0 +1,97 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/variant.h" + +namespace paddle { +namespace platform { + +struct CPUPlace { + // WORKAROUND: for some reason, omitting this constructor + // causes errors with boost 1.59 and OSX + CPUPlace() {} + + // needed for variant equality comparison + inline bool operator==(const CPUPlace &) const { return true; } + inline bool operator!=(const CPUPlace &) const { return false; } +}; + +struct CUDAPlace { + CUDAPlace() : CUDAPlace(0) {} + explicit CUDAPlace(int d) : device(d) {} + + inline int GetDeviceId() const { return device; } + // needed for variant equality comparison + inline bool operator==(const CUDAPlace &o) const { + return device == o.device; + } + inline bool operator!=(const CUDAPlace &o) const { return !(*this == o); } + + int device; +}; + +struct IsCUDAPlace : public boost::static_visitor { + bool operator()(const CPUPlace &) const { return false; } + bool operator()(const CUDAPlace &gpu) const { return true; } +}; + +typedef boost::variant Place; + +using PlaceList = std::vector; + +void set_place(const Place &); +const Place &get_place(); + +const CUDAPlace default_gpu(); +const CPUPlace default_cpu(); + +bool is_gpu_place(const Place &); +bool is_cpu_place(const Place &); +bool places_are_same_class(const Place &, const Place &); +bool is_same_place(const Place &, const Place &); + +std::ostream &operator<<(std::ostream &, const Place &); + +template +struct PlaceVisitorWrapper + : public boost::static_visitor { + const Visitor &visitor_; + explicit PlaceVisitorWrapper(const Visitor &visitor) : visitor_(visitor) {} + + typename Visitor::result_type operator()(const CPUPlace &cpu) const { + return visitor_(cpu); + } + + typename Visitor::result_type operator()(const CUDAPlace &cuda) const { +#ifdef PADDLE_WITH_CUDA + return visitor_(cuda); +#else + PADDLE_THROW("Paddle is not compiled with CUDA. Cannot visit cuda device"); + return typename Visitor::result_type(); +#endif + } +}; + +template +typename Visitor::result_type VisitPlace(const Place &place, + const Visitor &visitor) { + return boost::apply_visitor(PlaceVisitorWrapper(visitor), place); +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/place_test.cc b/paddle/fluid/platform/place_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f248902d91c1ffad1364a2f1078a41626b61ac22 --- /dev/null +++ b/paddle/fluid/platform/place_test.cc @@ -0,0 +1,54 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
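As a small illustration of the Place variant declared above, the sketch below dispatches a custom visitor through VisitPlace. The PlaceName visitor is hypothetical; when Paddle is built without CUDA, visiting a CUDAPlace throws, as implemented in PlaceVisitorWrapper:

#include <iostream>
#include <string>
#include "paddle/fluid/platform/place.h"

// Hypothetical visitor mapping a place to a short tag.
struct PlaceName : public boost::static_visitor<std::string> {
  std::string operator()(const paddle::platform::CPUPlace&) const { return "cpu"; }
  std::string operator()(const paddle::platform::CUDAPlace& p) const {
    return "gpu:" + std::to_string(p.device);
  }
};

void PlaceExample() {
  paddle::platform::Place place = paddle::platform::CUDAPlace(0);
  if (paddle::platform::is_gpu_place(place)) {
    std::cout << paddle::platform::VisitPlace(place, PlaceName()) << std::endl;
  }
}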
+#include "paddle/fluid/platform/place.h" +#include +#include "gtest/gtest.h" + +TEST(Place, Equality) { + paddle::platform::CPUPlace cpu; + paddle::platform::CUDAPlace g0(0), g1(1), gg0(0); + + EXPECT_EQ(cpu, cpu); + EXPECT_EQ(g0, g0); + EXPECT_EQ(g1, g1); + EXPECT_EQ(g0, gg0); + + EXPECT_NE(g0, g1); + + EXPECT_TRUE(paddle::platform::places_are_same_class(g0, gg0)); + EXPECT_FALSE(paddle::platform::places_are_same_class(g0, cpu)); +} + +TEST(Place, Default) { + EXPECT_TRUE(paddle::platform::is_gpu_place(paddle::platform::get_place())); + EXPECT_TRUE(paddle::platform::is_gpu_place(paddle::platform::default_gpu())); + EXPECT_TRUE(paddle::platform::is_cpu_place(paddle::platform::default_cpu())); + + EXPECT_FALSE(paddle::platform::is_cpu_place(paddle::platform::get_place())); + paddle::platform::set_place(paddle::platform::CPUPlace()); + EXPECT_TRUE(paddle::platform::is_cpu_place(paddle::platform::get_place())); +} + +TEST(Place, Print) { + { + std::stringstream ss; + ss << paddle::platform::CUDAPlace(1); + EXPECT_EQ("CUDAPlace(1)", ss.str()); + } + { + std::stringstream ss; + ss << paddle::platform::CPUPlace(); + EXPECT_EQ("CPUPlace", ss.str()); + } +} diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc new file mode 100644 index 0000000000000000000000000000000000000000..28d2675f799a2d398d43dc31c550f0d84424116e --- /dev/null +++ b/paddle/fluid/platform/profiler.cc @@ -0,0 +1,346 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/profiler.h" +#include +#include +#include "glog/logging.h" + +namespace paddle { +namespace platform { + +// The profiler state, the initial value is ProfilerState::kDisabled +static ProfilerState g_state = ProfilerState::kDisabled; +// To record which timer the profiler used, CUDA or CPU. +static std::string g_profiler_place = ""; +// The thread local event list only can be accessed by the specific thread +// The thread index of each thread +static thread_local int32_t g_thread_id; +// The g_next_thread_id is a global counter for threads, by the g_thread_id and +// g_next_thread_id, we can know how many threads have created EventList. +static uint32_t g_next_thread_id = 0; +// The global mutex +static std::mutex g_all_event_lists_mutex; +// The total event lists of all threads +static std::list> g_all_event_lists; +// The thread local event list only can be accessed by the specific thread +static thread_local std::shared_ptr g_event_list; + +inline uint64_t GetTimeInNsec() { + using clock = std::conditional::type; + return std::chrono::duration_cast( + clock::now().time_since_epoch()) + .count(); +} + +Event::Event(EventKind kind, std::string name, uint32_t thread_id, + const DeviceContext* dev_ctx) + : kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) { +#ifdef PADDLE_WITH_CUDA + has_cuda_ = dev_ctx ? 
platform::is_gpu_place(dev_ctx->GetPlace()) : false; + if (has_cuda_) { + auto* cuda_dev_ctx = static_cast(dev_ctx); + PADDLE_ENFORCE(cudaGetDevice(&device_)); + PADDLE_ENFORCE(cudaEventCreate(&event_)); + auto stream = cuda_dev_ctx->stream(); + PADDLE_ENFORCE(cudaEventRecord(event_, stream)); + } +#endif + cpu_ns_ = GetTimeInNsec(); +} + +std::string Event::kind() const { + switch (kind_) { + case EventKind::kMark: + return "mark"; + case EventKind::kPushRange: + return "push"; + case EventKind::kPopRange: + return "pop"; + } + PADDLE_THROW("Unknown EventKind."); +} + +double Event::CpuElapsedMs(const Event& e) const { + return (e.cpu_ns_ - cpu_ns_) / (1000000.0); +} + +double Event::CudaElapsedMs(const Event& e) const { +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE(e.has_cuda() && has_cuda()); + PADDLE_ENFORCE(e.device() == device()); + PADDLE_ENFORCE(cudaEventSynchronize(event_)); + PADDLE_ENFORCE(cudaEventSynchronize(e.event())); + float ms; + PADDLE_ENFORCE(cudaEventElapsedTime(&ms, event_, e.event())); + return ms; +#else + PADDLE_THROW("CUDA is not enabled"); +#endif +} + +#ifdef PADDLE_WITH_CUDA +static void ForEachDevice(std::function func) { + auto original_device = GetCurrentDeviceId(); + int count = GetCUDADeviceCount(); + for (int i = 0; i < count; i++) { + SetDeviceId(i); + func(i); + } + SetDeviceId(original_device); +} +#endif + +inline EventList& GetEventList() { + if (!g_event_list) { + std::lock_guard guard(g_all_event_lists_mutex); + g_event_list = std::make_shared(); + g_thread_id = g_next_thread_id++; + g_all_event_lists.emplace_front(g_event_list); + } + return *g_event_list; +} + +void Mark(const std::string& name, const DeviceContext* dev_ctx) { + GetEventList().Record(EventKind::kMark, name, g_thread_id, dev_ctx); +} + +void PushEvent(const std::string& name, const DeviceContext* dev_ctx) { + GetEventList().Record(EventKind::kPushRange, name, g_thread_id, dev_ctx); +} + +void PopEvent(const std::string& name, const DeviceContext* dev_ctx) { + GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx); +} + +RecordEvent::RecordEvent(const std::string& name, + const DeviceContext* dev_ctx) { + if (g_state == ProfilerState::kDisabled) return; + dev_ctx_ = dev_ctx; + name_ = name; + PushEvent(name_, dev_ctx_); +} + +RecordEvent::~RecordEvent() { + if (g_state == ProfilerState::kDisabled) return; + PopEvent(name_, dev_ctx_); +} + +void EnableProfiler(ProfilerState state) { + PADDLE_ENFORCE(state != ProfilerState::kDisabled, + "Can't enbale profling, since the input state is ", + "ProfilerState::kDisabled"); + PADDLE_ENFORCE(g_state == ProfilerState::kDisabled, + "The profiling state should be disabled when calling ", + "EnableProfiler."); + g_state = state; + g_profiler_place = (g_state == ProfilerState::kCUDA) ? "CUDA" : "CPU"; +#ifdef PADDLE_WITH_CUDA + if (g_state == ProfilerState::kCUDA) { + // Generate some dummy evenets first to reduce the startup overhead. + for (int i = 0; i < 5; i++) { + ForEachDevice([](int d) { + DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(d)); + Mark("_cuda_startup_", dev_ctx); + dev_ctx->Wait(); + delete dev_ctx; + }); + } + } +#endif + // Mark the profiling start. 
+ Mark("_start_profiler_", nullptr); +} + +void ResetProfiler() { + std::lock_guard guard(g_all_event_lists_mutex); + for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end(); + ++it) { + (*it)->Clear(); + } +} + +std::vector> GetAllEvents() { + std::lock_guard guard(g_all_event_lists_mutex); + std::vector> result; + for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end(); + ++it) { + result.emplace_back((*it)->Reduce()); + } + return result; +} + +void DisableProfiler(EventSortingKey sorted_key) { + PADDLE_ENFORCE(g_state != ProfilerState::kDisabled, + "Can't disable profiling, since it's not starting."); + // Mark the profiling stop. + Mark("_stop_profiler_", nullptr); + g_state = ProfilerState::kDisabled; + + std::vector> all_events = GetAllEvents(); + ParseEvents(all_events, sorted_key); + ResetProfiler(); +} + +void ParseEvents(std::vector>& events, + EventSortingKey sorted_by) { + if (g_profiler_place == "") return; + + std::string sorted_domain; + std::function sorted_func; + switch (sorted_by) { + case EventSortingKey::kCalls: + sorted_domain = "number of calls"; + sorted_func = [](const EventItem& a, const EventItem& b) { + return a.calls > b.calls; + }; + break; + case EventSortingKey::kTotal: + sorted_domain = "total time"; + sorted_func = [](const EventItem& a, const EventItem& b) { + return a.total_time > b.total_time; + }; + break; + case EventSortingKey::kMin: + sorted_domain = "minimum time"; + sorted_func = [](const EventItem& a, const EventItem& b) { + return a.min_time > b.min_time; + }; + break; + case EventSortingKey::kMax: + sorted_domain = "maximum time"; + sorted_func = [](const EventItem& a, const EventItem& b) { + return a.max_time > b.max_time; + }; + break; + case EventSortingKey::kAve: + sorted_domain = "average time"; + sorted_func = [](const EventItem& a, const EventItem& b) { + return a.ave_time > b.ave_time; + }; + break; + default: + sorted_domain = "event first end time"; + } + + std::vector> events_table; + size_t max_name_width = 0; + for (size_t i = 0; i < events.size(); i++) { + std::list pushed_events; + std::vector event_items; + std::unordered_map event_idx; + + for (size_t j = 0; j < events[i].size(); j++) { + if (events[i][j].kind() == "push") { + pushed_events.push_back(events[i][j]); + } else if (events[i][j].kind() == "pop") { + std::list::reverse_iterator rit = pushed_events.rbegin(); + while (rit != pushed_events.rend() && + rit->name() != events[i][j].name()) { + ++rit; + } + + if (rit != pushed_events.rend()) { + double event_time = (g_profiler_place == "CUDA") + ? 
rit->CudaElapsedMs(events[i][j]) + : rit->CpuElapsedMs(events[i][j]); + std::string event_name = + "thread" + std::to_string(rit->thread_id()) + "::" + rit->name(); + max_name_width = std::max(max_name_width, event_name.size()); + + if (event_idx.find(event_name) == event_idx.end()) { + event_idx[event_name] = event_items.size(); + EventItem event_item = {event_name, 1, event_time, + event_time, event_time, event_time}; + event_items.push_back(event_item); + } else { + int index = event_idx[event_name]; + event_items[index].calls += 1; + // total time + event_items[index].total_time += event_time; + // min time + event_items[index].min_time = + std::min(event_time, event_items[index].min_time); + // max time + event_items[index].max_time = + std::max(event_time, event_items[index].max_time); + } + + // remove the push marker from the list + pushed_events.erase((++rit).base()); + } else { + LOG(WARNING) << "Cannot find the push marker of event \'" + << events[i][j].name() + << "\', which will be ignored in profiling report."; + } + } + } + // average time + for (auto& item : event_items) { + item.ave_time = item.total_time / item.calls; + } + // sort + if (sorted_by != EventSortingKey::kDefault) { + std::sort(event_items.begin(), event_items.end(), sorted_func); + } + + events_table.push_back(event_items); + // log warning if there are events with `push` but without `pop` + std::list::reverse_iterator rit = pushed_events.rbegin(); + while (rit != pushed_events.rend()) { + LOG(WARNING) << "Cannot find the pop marker of event \'" << rit->name() + << "\', which will be ignored in profiling report."; + ++rit; + } + } + + // Print report + PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12); +} + +void PrintProfiler(std::vector>& events_table, + std::string& sorted_domain, const size_t name_width, + const size_t data_width) { + // Output header information + std::cout << "\n------------------------->" + << " Profiling Report " + << "<-------------------------\n\n"; + std::cout << "Place: " << g_profiler_place << std::endl; + std::cout << "Time unit: ms" << std::endl; + std::cout << "Sorted by " << sorted_domain + << " in descending order in the same thread\n\n"; + // Output events table + std::cout.setf(std::ios::left); + std::cout << std::setw(name_width) << "Event" << std::setw(data_width) + << "Calls" << std::setw(data_width) << "Total" + << std::setw(data_width) << "Min." << std::setw(data_width) + << "Max." << std::setw(data_width) << "Ave." << std::endl; + for (size_t i = 0; i < events_table.size(); ++i) { + for (size_t j = 0; j < events_table[i].size(); ++j) { + EventItem& event_item = events_table[i][j]; + std::cout << std::setw(name_width) << event_item.name + << std::setw(data_width) << event_item.calls + << std::setw(data_width) << event_item.total_time + << std::setw(data_width) << event_item.min_time + << std::setw(data_width) << event_item.max_time + << std::setw(data_width) << event_item.ave_time << std::endl; + } + } + std::cout << std::endl; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h new file mode 100644 index 0000000000000000000000000000000000000000..0bc5e666cb4f28b99169435cb3dd52829c35a2c2 --- /dev/null +++ b/paddle/fluid/platform/profiler.h @@ -0,0 +1,150 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace platform { + +enum EventKind { kMark, kPushRange, kPopRange }; + +class Event { + public: + // The DeviceContext is used to get the cuda stream. + // If CPU profiling mode, can pass nullptr. + Event(EventKind kind, std::string name, uint32_t thread_id, + const DeviceContext* dev_ctx); + + std::string kind() const; + std::string name() const { return name_; } + uint32_t thread_id() const { return thread_id_; } + bool has_cuda() const { return has_cuda_; } + +#ifdef PADDLE_WITH_CUDA + cudaEvent_t event() const { return event_; } + int device() const { return device_; } +#endif + + double CpuElapsedMs(const Event& e) const; + double CudaElapsedMs(const Event& e) const; + + private: + EventKind kind_; + std::string name_; + uint32_t thread_id_; + int64_t cpu_ns_; + bool has_cuda_; +#ifdef PADDLE_WITH_CUDA + cudaEvent_t event_ = nullptr; + int device_ = -1; +#endif +}; + +struct EventList { + constexpr static size_t kMB = 1024 * 1024; + constexpr static size_t kEventBlockSize = 16 * kMB; + constexpr static size_t kEventSize = sizeof(Event); + constexpr static size_t kEventAlign = alignof(Event); + constexpr static size_t kNumBlock = + kEventBlockSize / + ((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign); + + template + void Record(Args&&... args) { + if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) { + event_blocks.emplace_front(); + event_blocks.front().reserve(kNumBlock); + } + event_blocks.front().emplace_back(std::forward(args)...); + } + + std::vector Reduce() { + std::vector result; + for (auto& block : event_blocks) { + result.insert(result.begin(), std::make_move_iterator(block.begin()), + std::make_move_iterator(block.end())); + } + event_blocks.clear(); + return result; + } + + void Clear() { event_blocks.clear(); } + + std::forward_list> event_blocks; +}; + +enum ProfilerState { + kDisabled, // disabled state + kCPU, // CPU profiling state + kCUDA, // GPU profiling state +}; + +void Mark(const std::string& name, const DeviceContext* dev_ctx); + +void PushEvent(const std::string& name, const DeviceContext* dev_ctx); + +void PopEvent(const std::string& name, const DeviceContext* dev_ctx); + +struct RecordEvent { + explicit RecordEvent(const std::string& name, const DeviceContext* dev_ctx); + + ~RecordEvent(); + + // The device context is used by Event to get the current cuda stream. + const DeviceContext* dev_ctx_; + // Event name + std::string name_; +}; + +// Return the event list of all threads. Asummed the returned value calls +// event_lists, event_lists[i][j] represents the j-th Event of i-th thread. +std::vector> GetAllEvents(); + +// The information of each event given in the profiling report +struct EventItem { + std::string name; + int calls; + double total_time; + double min_time; + double max_time; + double ave_time; +}; + +// Candidate keys to sort the profiling report +enum EventSortingKey { kDefault, kCalls, kTotal, kMin, kMax, kAve }; + +// Enable the profiling function. 
+void EnableProfiler(ProfilerState state); + +// Clear the g_all_event_lists, which is total event lists of all threads. +void ResetProfiler(); + +void DisableProfiler(EventSortingKey sorted_key); + +// Parse the event list and output the profiling report +void ParseEvents(std::vector>&, + EventSortingKey sorted_by = EventSortingKey::kDefault); + +// Print results +void PrintProfiler(std::vector>& events_table, + std::string& sorted_domain, const size_t name_width, + const size_t data_width); + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler_test.cc b/paddle/fluid/platform/profiler_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..d2525c38b6fb98ea9bac49f7eb28e755bd7fa9a2 --- /dev/null +++ b/paddle/fluid/platform/profiler_test.cc @@ -0,0 +1,129 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/profiler.h" +#include "gtest/gtest.h" + +TEST(Event, CpuElapsedTime) { + using paddle::platform::Event; + using paddle::platform::EventKind; + + Event start_event(EventKind::kPushRange, "test", 0, nullptr); + EXPECT_TRUE(start_event.has_cuda() == false); + int counter = 0; + while (counter != 1000) { + counter++; + } + Event stop_event(EventKind::kPopRange, "test", 0, nullptr); + EXPECT_GT(start_event.CpuElapsedMs(stop_event), 0); +} + +#ifdef PADDLE_WITH_CUDA +TEST(Event, CudaElapsedTime) { + using paddle::platform::DeviceContext; + using paddle::platform::CUDADeviceContext; + using paddle::platform::CUDAPlace; + using paddle::platform::Event; + using paddle::platform::EventKind; + + DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(0)); + Event start_event(EventKind::kPushRange, "test", 0, dev_ctx); + EXPECT_TRUE(start_event.has_cuda() == true); + int counter = 0; + while (counter != 1000) { + counter++; + } + Event stop_event(EventKind::kPopRange, "test", 0, dev_ctx); + EXPECT_GT(start_event.CudaElapsedMs(stop_event), 0); +} +#endif + +TEST(RecordEvent, RecordEvent) { + using paddle::platform::DeviceContext; + using paddle::platform::Event; + using paddle::platform::EventKind; + using paddle::platform::RecordEvent; + using paddle::platform::ProfilerState; + using paddle::platform::EventSortingKey; + + ProfilerState state = ProfilerState::kCPU; + DeviceContext* dev_ctx = nullptr; +#ifdef PADDLE_WITH_CUDA + using paddle::platform::CUDADeviceContext; + using paddle::platform::CUDAPlace; + state = ProfilerState::kCUDA; + dev_ctx = + new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace(0)); +#endif + EnableProfiler(state); + + /* Usage 1: + * PushEvent(evt_name, dev_ctx); + * ... + * code to be analyzed + * ... 
+ * PopEvent(evt_name, dev_ctx); + */ + for (int loop = 0; loop < 3; ++loop) { + for (int i = 1; i < 5; ++i) { + std::string name = "op_" + std::to_string(i); + PushEvent(name, dev_ctx); + int counter = 1; + while (counter != i * 1000) counter++; + PopEvent(name, dev_ctx); + } + } + + /* Usage 2: + * { + * RecordEvent record_event(name, dev_ctx); + * ... + * code to be analyzed + * ... + * } + */ + for (int i = 1; i < 5; ++i) { + std::string name = "evs_op_" + std::to_string(i); + RecordEvent record_event(name, dev_ctx); + int counter = 1; + while (counter != i * 1000) counter++; + } + + // Bad Usage: + PushEvent("event_without_pop", dev_ctx); + PopEvent("event_without_push", dev_ctx); + std::vector> events = paddle::platform::GetAllEvents(); + + int cuda_startup_count = 0; + int start_profiler_count = 0; + for (size_t i = 0; i < events.size(); ++i) { + for (size_t j = 0; j < events[i].size(); ++j) { + if (events[i][j].name() == "_cuda_startup_") ++cuda_startup_count; + if (events[i][j].name() == "_start_profiler_") ++start_profiler_count; + if (events[i][j].name() == "push") { + EXPECT_EQ(events[i][j + 1].name(), "pop"); +#ifdef PADDLE_WITH_CUDA + EXPECT_GT(events[i][j].CudaElapsedMs(events[i][j + 1]), 0); +#else + EXPECT_GT(events[i][j].CpuElapsedMs(events[i][j + 1]), 0); +#endif + } + } + } + EXPECT_EQ(cuda_startup_count % 5, 0); + EXPECT_EQ(start_profiler_count, 1); + + // Will remove parsing-related code from test later + DisableProfiler(EventSortingKey::kTotal); +} diff --git a/paddle/fluid/platform/transform.h b/paddle/fluid/platform/transform.h new file mode 100644 index 0000000000000000000000000000000000000000..879daed19102c85cc5ea03933f8324023cec0fe2 --- /dev/null +++ b/paddle/fluid/platform/transform.h @@ -0,0 +1,93 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/fluid/platform/place.h" + +#include +#include +#ifdef __NVCC__ +#include +#include +#include "paddle/fluid/platform/details/device_ptr_cast.h" +#endif + +namespace paddle { +namespace platform { + +// Transform on host or device. It provides the same API in std library. 
+template +struct Transform { + template + void operator()(const DeviceContext& context, InputIter first, InputIter last, + OutputIter result, UnaryOperation op); + + template + void operator()(const DeviceContext& context, InputIter1 first1, + InputIter1 last1, InputIter2 first2, OutputIter result, + BinaryOperation op); +}; + +template <> +struct Transform { + template + void operator()(const platform::CPUDeviceContext& context, InputIter first, + InputIter last, OutputIter result, UnaryOperation op) { + std::transform(first, last, result, op); + } + + template + void operator()(const platform::CPUDeviceContext& context, InputIter1 first1, + InputIter1 last1, InputIter2 first2, OutputIter result, + BinaryOperation op) { + std::transform(first1, last1, first2, result, op); + } +}; + +#ifdef __NVCC__ +template <> +struct Transform { + template + void operator()(const platform::CUDADeviceContext& context, InputIter first, + InputIter last, OutputIter result, UnaryOperation op) { + auto place = context.GetPlace(); + PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place."); + thrust::transform(thrust::cuda::par.on(context.stream()), + details::DevPtrCast(first), details::DevPtrCast(last), + details::DevPtrCast(result), op); + } + + template + void operator()(const platform::CUDADeviceContext& context, InputIter1 first1, + InputIter1 last1, InputIter2 first2, OutputIter result, + BinaryOperation op) { + auto place = context.GetPlace(); + PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place."); + thrust::transform(thrust::cuda::par.on(context.stream()), + details::DevPtrCast(first1), details::DevPtrCast(last1), + details::DevPtrCast(first2), details::DevPtrCast(result), + op); + } +}; +#endif + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/transform_test.cu b/paddle/fluid/platform/transform_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..0e4b9edc2fd45e9c00f5339948172f6267210363 --- /dev/null +++ b/paddle/fluid/platform/transform_test.cu @@ -0,0 +1,95 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/fluid/platform/transform.h" + +template +class Scale { + public: + explicit Scale(const T& scale) : scale_(scale) {} + + HOSTDEVICE T operator()(const T& a) const { return a * scale_; } + + private: + T scale_; +}; + +template +class Multiply { + public: + HOSTDEVICE T operator()(const T& a, const T& b) const { return a * b; } +}; + +TEST(Transform, CPUUnary) { + using namespace paddle::platform; + CPUDeviceContext ctx; + float buf[4] = {0.1, 0.2, 0.3, 0.4}; + Transform trans; + trans(ctx, buf, buf + 4, buf, Scale(10)); + for (int i = 0; i < 4; ++i) { + ASSERT_NEAR(buf[i], static_cast(i + 1), 1e-5); + } +} + +TEST(Transform, GPUUnary) { + using namespace paddle::platform; + using namespace paddle::memory; + CUDAPlace gpu0(0); + CUDADeviceContext ctx(gpu0); + float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4}; + float* gpu_buf = static_cast(Alloc(gpu0, sizeof(float) * 4)); + Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf), ctx.stream()); + Transform trans; + trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale(10)); + ctx.Wait(); + Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf), ctx.stream()); + Free(gpu0, gpu_buf); + for (int i = 0; i < 4; ++i) { + ASSERT_NEAR(cpu_buf[i], static_cast(i + 1), 1e-5); + } +} + +TEST(Transform, CPUBinary) { + using namespace paddle::platform; + using namespace paddle::memory; + int buf[4] = {1, 2, 3, 4}; + Transform trans; + CPUDeviceContext ctx; + trans(ctx, buf, buf + 4, buf, buf, Multiply()); + for (int i = 0; i < 4; ++i) { + ASSERT_EQ((i + 1) * (i + 1), buf[i]); + } +} + +TEST(Transform, GPUBinary) { + using namespace paddle::platform; + using namespace paddle::memory; + int buf[4] = {1, 2, 3, 4}; + CUDAPlace gpu0(0); + CUDADeviceContext ctx(gpu0); + int* gpu_buf = static_cast(Alloc(gpu0, sizeof(buf))); + Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx.stream()); + Transform trans; + trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply()); + ctx.Wait(); + Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf), ctx.stream()); + Free(gpu0, gpu_buf); + for (int i = 0; i < 4; ++i) { + ASSERT_EQ((i + 1) * (i + 1), buf[i]); + } +} diff --git a/paddle/platform/variant.h b/paddle/fluid/platform/variant.h similarity index 100% rename from paddle/platform/variant.h rename to paddle/fluid/platform/variant.h diff --git a/paddle/fluid/pybind/.clang-format b/paddle/fluid/pybind/.clang-format new file mode 100644 index 0000000000000000000000000000000000000000..29282dc87e2c499988c17d90d47d44cd5cf7f115 --- /dev/null +++ b/paddle/fluid/pybind/.clang-format @@ -0,0 +1,5 @@ +--- +Language: Cpp +BasedOnStyle: Google +Standard: Cpp11 +... 
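Complementing the transform tests above, here is a minimal sketch of the CPU specialization driven by an ordinary lambda, which works because the CPU path is a thin wrapper over std::transform; the GPU path still requires a HOSTDEVICE functor as in the tests. The buffer and lambda are illustrative only:

#include <array>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/transform.h"

void TransformCpuExample() {
  std::array<float, 4> buf{{1.f, 2.f, 3.f, 4.f}};
  paddle::platform::CPUDeviceContext ctx;
  paddle::platform::Transform<paddle::platform::CPUDeviceContext> trans;
  // Unary form: forwards directly to std::transform on the CPU.
  trans(ctx, buf.begin(), buf.end(), buf.begin(),
        [](float x) { return x + 0.5f; });  // buf becomes {1.5, 2.5, 3.5, 4.5}
}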
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..d62f34030894e2fa21925bbc44e24b4e7d738d15 --- /dev/null +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -0,0 +1,9 @@ +if(WITH_PYTHON) + cc_library(paddle_pybind SHARED + SRCS pybind.cc exception.cc protobuf.cc const_value.cc + DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method + ${GLOB_OP_LIB}) + if(NOT APPLE AND NOT ANDROID) + target_link_libraries(paddle_pybind rt) + endif(NOT APPLE AND NOT ANDROID) +endif(WITH_PYTHON) diff --git a/paddle/fluid/pybind/const_value.cc b/paddle/fluid/pybind/const_value.cc new file mode 100644 index 0000000000000000000000000000000000000000..098252a83d3b7e2926bf737ce7f2b3794046f28f --- /dev/null +++ b/paddle/fluid/pybind/const_value.cc @@ -0,0 +1,29 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "const_value.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace pybind { + +void BindConstValue(pybind11::module& m) { + m.def("kEmptyVarName", [] { return framework::kEmptyVarName; }); + m.def("kTempVarName", [] { return framework::kTempVarName; }); + m.def("kGradVarSuffix", [] { return framework::kGradVarSuffix; }); + m.def("kZeroVarSuffix", [] { return framework::kZeroVarSuffix; }); +} + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/const_value.h b/paddle/fluid/pybind/const_value.h new file mode 100644 index 0000000000000000000000000000000000000000..67d14ac9ff01d1754dd8dd165b638db12c9d0ea0 --- /dev/null +++ b/paddle/fluid/pybind/const_value.h @@ -0,0 +1,26 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/platform/enforce.h" +#include "pybind11/pybind11.h" + +namespace py = pybind11; + +namespace paddle { +namespace pybind { +extern void BindConstValue(pybind11::module& m); +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/exception.cc b/paddle/fluid/pybind/exception.cc new file mode 100644 index 0000000000000000000000000000000000000000..7398a88541bcbf338ca9568595d0dc7b16eff118 --- /dev/null +++ b/paddle/fluid/pybind/exception.cc @@ -0,0 +1,34 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/pybind/exception.h" + +namespace paddle { +namespace pybind { + +void BindException(pybind11::module& m) { + static pybind11::exception exc(m, "EnforceNotMet"); + pybind11::register_exception_translator([](std::exception_ptr p) { + try { + if (p) std::rethrow_exception(p); + } catch (const platform::EnforceNotMet& e) { + exc(e.what()); + } + }); + + m.def("__unittest_throw_exception__", [] { PADDLE_THROW("test exception"); }); +} + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/exception.h b/paddle/fluid/pybind/exception.h new file mode 100644 index 0000000000000000000000000000000000000000..43e91a706300e561a20d88abe80d9b6654bd2171 --- /dev/null +++ b/paddle/fluid/pybind/exception.h @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/platform/enforce.h" +#include "pybind11/pybind11.h" +namespace paddle { +namespace pybind { + +extern void BindException(pybind11::module& m); +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc new file mode 100644 index 0000000000000000000000000000000000000000..b5f1a61d8bcdf8a0fac375fcfeb4f471a5bfba9c --- /dev/null +++ b/paddle/fluid/pybind/protobuf.cc @@ -0,0 +1,298 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/pybind/protobuf.h" +#include +#include +#include "paddle/fluid/framework/backward.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/var_desc.h" + +// Cast boost::variant for PyBind. 
+// Copy from +// https://github.com/pybind/pybind11/issues/576#issuecomment-269563199 +namespace pybind11 { +namespace detail { + +// Can be replaced by a generic lambda in C++14 +struct variant_caster_visitor : public boost::static_visitor { + return_value_policy policy; + handle parent; + + variant_caster_visitor(return_value_policy policy, handle parent) + : policy(policy), parent(parent) {} + + template + handle operator()(T const &src) const { + return make_caster::cast(src, policy, parent); + } +}; + +template +struct variant_caster; + +template
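// For orientation, a hedged sketch of how BindException/BindConstValue are
// typically wired into the Python extension module. The module name
// "example_core" and the PYBIND11_MODULE macro (pybind11 >= 2.2) are
// assumptions for illustration, not taken from this patch.
#include "paddle/fluid/pybind/const_value.h"
#include "paddle/fluid/pybind/exception.h"
#include "pybind11/pybind11.h"

PYBIND11_MODULE(example_core, m) {
  paddle::pybind::BindException(m);   // EnforceNotMet becomes a Python exception
  paddle::pybind::BindConstValue(m);  // exposes kEmptyVarName, kTempVarName, ...
}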